Skip to content

Commit

Permalink
Add str.edit_distance_matrix (#8463)
Browse files Browse the repository at this point in the history
This PR exposes nvtext's `edit_distance_matrix` through the cuDF Python API (`Series.str.edit_distance_matrix`), adding the necessary precondition checks. It also adds Python tests.

Closes #6341

Authors:
  - Michael Wang (https://github.com/isVoid)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Marlene  (https://github.com/marlenezw)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: #8463
  • Loading branch information
isVoid authored Jun 10, 2021
1 parent 3fd3624 commit b895396
Show file tree
Hide file tree
Showing 4 changed files with 89 additions and 2 deletions.
4 changes: 4 additions & 0 deletions python/cudf/cudf/_lib/cpp/nvtext/edit_distance.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,7 @@ cdef extern from "nvtext/edit_distance.hpp" namespace "nvtext" nogil:
const column_view & strings,
const column_view & targets
) except +

# Cython declaration of nvtext::edit_distance_matrix: takes a single strings
# column view and returns ownership of a newly created column.
# `except +` makes Cython translate any C++ exception into a Python exception.
cdef unique_ptr[column] edit_distance_matrix(
const column_view & strings
) except +
13 changes: 12 additions & 1 deletion python/cudf/cudf/_lib/nvtext/edit_distance.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@ from libcpp.utility cimport move
from cudf._lib.cpp.column.column cimport column
from cudf._lib.cpp.column.column_view cimport column_view
from cudf._lib.cpp.nvtext.edit_distance cimport (
edit_distance as cpp_edit_distance
edit_distance as cpp_edit_distance,
edit_distance_matrix as cpp_edit_distance_matrix
)
from cudf._lib.column cimport Column

Expand All @@ -21,3 +22,13 @@ def edit_distance(Column strings, Column targets):
c_result = move(cpp_edit_distance(c_strings, c_targets))

return Column.from_unique_ptr(move(c_result))


def edit_distance_matrix(Column strings):
    """Call nvtext's ``edit_distance_matrix`` on ``strings``.

    The input column is passed to the C++ routine as a view, the call is
    made with the GIL released, and the resulting C++ column is wrapped in
    a new ``Column`` that takes ownership of it.
    """
    cdef unique_ptr[column] c_matrix
    cdef column_view c_input = strings.view()

    # The C++ call does not touch Python objects, so the GIL can be dropped.
    with nogil:
        c_matrix = move(cpp_edit_distance_matrix(c_input))

    return Column.from_unique_ptr(move(c_matrix))
46 changes: 45 additions & 1 deletion python/cudf/cudf/core/column/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,10 @@
from cudf import _lib as libcudf
from cudf._lib import string_casting as str_cast
from cudf._lib.column import Column
from cudf._lib.nvtext.edit_distance import edit_distance as cpp_edit_distance
from cudf._lib.nvtext.edit_distance import (
edit_distance as cpp_edit_distance,
edit_distance_matrix as cpp_edit_distance_matrix,
)
from cudf._lib.nvtext.generate_ngrams import (
generate_character_ngrams as cpp_generate_character_ngrams,
generate_ngrams as cpp_generate_ngrams,
Expand Down Expand Up @@ -4859,6 +4862,47 @@ def edit_distance(self, targets) -> ParentType:
cpp_edit_distance(self._column, targets_column)
)

def edit_distance_matrix(self) -> ParentType:
    """Compute the pairwise edit distance between strings in the series.

    The series must contain at least 2 strings and must not contain
    nulls.

    Edit distance is measured based on the `Levenshtein edit distance
    algorithm
    <https://www.cuelogic.com/blog/the-levenshtein-algorithm>`_.

    Returns
    -------
    Series of ListDtype(int64)
        Assume `N` is the length of this series. The return series contains
        `N` lists of size `N`, where the `j`th number in the `i`th row of
        the series tells the edit distance between the `i`th string and the
        `j`th string of this series.
        The matrix is symmetric. Diagonal elements are 0.

    Raises
    ------
    ValueError
        If the series holds fewer than 2 strings, or if it contains
        any null entries.

    Examples
    --------
    >>> import cudf
    >>> s = cudf.Series(['abc', 'bc', 'cba'])
    >>> s.str.edit_distance_matrix()
    0    [0, 1, 2]
    1    [1, 0, 2]
    2    [2, 2, 0]
    dtype: list
    """
    # Validate preconditions here so callers get a descriptive ValueError
    # before the column is handed to the native routine.
    if self._column.size < 2:
        raise ValueError(
            "Require size >= 2 to compute edit distance matrix."
        )
    if self._column.has_nulls:
        raise ValueError(
            "Cannot compute edit distance between null strings. "
            "Consider removing them using `dropna` or fill with `fillna`."
        )
    return self._return_or_inplace(cpp_edit_distance_matrix(self._column))


def _massage_string_arg(value, name, allow_col=False):
if isinstance(value, str):
Expand Down
28 changes: 28 additions & 0 deletions python/cudf/cudf/tests/test_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -787,6 +787,34 @@ def test_edit_distance():
assert_eq(expected, actual)


def test_edit_distance_matrix():
    """Check the pairwise distance matrix and both rejected inputs."""
    # Happy path: five strings give a symmetric 5x5 matrix with a zero
    # diagonal.
    words = ["rounded", "bounded", "bounce", "trounce", "ounce"]
    distances = [
        [0, 1, 3, 3, 3],
        [1, 0, 2, 4, 3],
        [3, 2, 0, 2, 1],
        [3, 4, 2, 0, 2],
        [3, 3, 1, 2, 0],
    ]
    series = cudf.Series(words)
    expected = cudf.Series(distances)
    assert_eq(expected, series.str.edit_distance_matrix(), check_dtype=False)

    # A single-row series is rejected.
    single_row = cudf.Series(["x"])
    with pytest.raises(ValueError, match="Require size >= 2"):
        single_row.str.edit_distance_matrix()

    # A series containing a null row is rejected.
    with_null = cudf.Series(["rounded", None, "bounce", "trounce", "ounce"])
    with pytest.raises(ValueError, match="Cannot compute"):
        with_null.str.edit_distance_matrix()


def test_porter_stemmer_measure():
strings = cudf.Series(
[
Expand Down

0 comments on commit b895396

Please sign in to comment.