Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add str.edit_distance_matrix #8463

Merged
merged 4 commits into from
Jun 10, 2021
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions python/cudf/cudf/_lib/cpp/nvtext/edit_distance.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,7 @@ cdef extern from "nvtext/edit_distance.hpp" namespace "nvtext" nogil:
const column_view & strings,
const column_view & targets
) except +

# Declaration of nvtext::edit_distance_matrix from "nvtext/edit_distance.hpp".
# Takes a single strings column view and hands back the result as a
# unique_ptr[column]. `except +` lets Cython convert a C++ exception raised
# by the call into a Python exception.
cdef unique_ptr[column] edit_distance_matrix(
const column_view & strings
) except +
13 changes: 12 additions & 1 deletion python/cudf/cudf/_lib/nvtext/edit_distance.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@ from libcpp.utility cimport move
from cudf._lib.cpp.column.column cimport column
from cudf._lib.cpp.column.column_view cimport column_view
from cudf._lib.cpp.nvtext.edit_distance cimport (
edit_distance as cpp_edit_distance
edit_distance as cpp_edit_distance,
edit_distance_matrix as cpp_edit_distance_matrix
)
from cudf._lib.column cimport Column

Expand All @@ -21,3 +22,13 @@ def edit_distance(Column strings, Column targets):
c_result = move(cpp_edit_distance(c_strings, c_targets))

return Column.from_unique_ptr(move(c_result))


def edit_distance_matrix(Column strings):
    """Run nvtext's ``edit_distance_matrix`` on ``strings``.

    Parameters
    ----------
    strings : Column
        Column whose view is passed to the C++ routine.

    Returns
    -------
    Column
        Column wrapping the ``unique_ptr[column]`` returned by the C++ call.
    """
    cdef unique_ptr[column] c_matrix
    cdef column_view c_input = strings.view()

    # Release the GIL while the C++ routine runs.
    with nogil:
        c_matrix = move(cpp_edit_distance_matrix(c_input))

    # Ownership of the result is transferred to the Python-level Column.
    return Column.from_unique_ptr(move(c_matrix))
49 changes: 48 additions & 1 deletion python/cudf/cudf/core/column/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,10 @@
from cudf import _lib as libcudf
from cudf._lib import string_casting as str_cast
from cudf._lib.column import Column
from cudf._lib.nvtext.edit_distance import edit_distance as cpp_edit_distance
from cudf._lib.nvtext.edit_distance import (
edit_distance as cpp_edit_distance,
edit_distance_matrix as cpp_edit_distance_matrix,
)
from cudf._lib.nvtext.generate_ngrams import (
generate_character_ngrams as cpp_generate_character_ngrams,
generate_ngrams as cpp_generate_ngrams,
Expand Down Expand Up @@ -4859,6 +4862,50 @@ def edit_distance(self, targets) -> ParentType:
cpp_edit_distance(self._column, targets_column)
)

def edit_distance_matrix(self) -> ParentType:
    """Computes the edit distance between strings in the series.

    The series to compute the matrix should have at least 2 strings and
    should not contain nulls.

    Edit distance is measured based on the Levenshtein edit distance
    algorithm.
    https://www.cuelogic.com/blog/the-levenshtein-algorithm

    Returns
    -------
    Series of ListDtype(int64)
        Assume `N` is the length of this series. The return series contains
        `N` lists of size `N`, where the `j`th number in the `i`th row of
        the series tells the edit distance between the `i`th string and the
        `j`th string of this series.
        The matrix is symmetric. Diagonal elements are 0.

    Raises
    ------
    ValueError
        If the series holds fewer than 2 strings, or if it contains nulls.

    Examples
    --------
    >>> import cudf
    >>> s = cudf.Series(['abc', 'bc', 'cba'])
    >>> s.str.edit_distance_matrix()
    0    [0, 1, 2]
    1    [1, 0, 2]
    2    [2, 2, 0]
    dtype: list
    """
    # Validate up front so callers get a clear Python-level error instead
    # of whatever the lower-level routine would report.
    if self._column.size < 2:
        raise ValueError(
            "Require size >= 2 to compute edit distance matrix."
        )
    if self._column.has_nulls:
        raise ValueError(
            "Cannot compute edit distance between null strings. "
            "Consider removing them using `dropna` or fill with `fillna`."
        )
    return self._return_or_inplace(cpp_edit_distance_matrix(self._column))


def _massage_string_arg(value, name, allow_col=False):
if isinstance(value, str):
Expand Down
28 changes: 28 additions & 0 deletions python/cudf/cudf/tests/test_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -787,6 +787,34 @@ def test_edit_distance():
assert_eq(expected, actual)


def test_edit_distance_matrix():
    """Exercise ``str.edit_distance_matrix`` on one valid and two invalid
    inputs."""
    # Valid input: five distinct strings, compared pairwise.
    # NOTE: according to the author's review reply, only string-typed series
    # are accepted; a series whose rows are lists of strings is not supported.
    words = cudf.Series(["rounded", "bounded", "bounce", "trounce", "ounce"])
    expected_matrix = cudf.Series(
        [
            [0, 1, 3, 3, 3],
            [1, 0, 2, 4, 3],
            [3, 2, 0, 2, 1],
            [3, 4, 2, 0, 2],
            [3, 3, 1, 2, 0],
        ]
    )
    actual_matrix = words.str.edit_distance_matrix()

    # Only the values are compared; the dtype check is switched off.
    assert_eq(expected_matrix, actual_matrix, check_dtype=False)

    # A single-row series must be rejected.
    single_row = cudf.Series(["x"])
    with pytest.raises(ValueError, match="Require size >= 2"):
        single_row.str.edit_distance_matrix()

    # A series containing a null row must be rejected.
    with_null = cudf.Series(["rounded", None, "bounce", "trounce", "ounce"])
    with pytest.raises(ValueError, match="Cannot compute"):
        with_null.str.edit_distance_matrix()


def test_porter_stemmer_measure():
strings = cudf.Series(
[
Expand Down