From b895396d5cff7347fa7dde84d5b3aa1d2f649e08 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Thu, 10 Jun 2021 08:29:53 -0700 Subject: [PATCH] Add `str.edit_distance_matrix` (#8463) This PR plumbs nvtext's `edit_distance_matrix` to cudf python with necessary precondition checks. It also adds python tests. Closes #6341 Authors: - Michael Wang (https://github.com/isVoid) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Marlene (https://github.com/marlenezw) - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/8463 --- .../cudf/_lib/cpp/nvtext/edit_distance.pxd | 4 ++ .../cudf/cudf/_lib/nvtext/edit_distance.pyx | 13 +++++- python/cudf/cudf/core/column/string.py | 46 ++++++++++++++++++- python/cudf/cudf/tests/test_text.py | 28 +++++++++++ 4 files changed, 89 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/_lib/cpp/nvtext/edit_distance.pxd b/python/cudf/cudf/_lib/cpp/nvtext/edit_distance.pxd index 2a27cd3c338..0d846702c9d 100644 --- a/python/cudf/cudf/_lib/cpp/nvtext/edit_distance.pxd +++ b/python/cudf/cudf/_lib/cpp/nvtext/edit_distance.pxd @@ -12,3 +12,7 @@ cdef extern from "nvtext/edit_distance.hpp" namespace "nvtext" nogil: const column_view & strings, const column_view & targets ) except + + + cdef unique_ptr[column] edit_distance_matrix( + const column_view & strings + ) except + diff --git a/python/cudf/cudf/_lib/nvtext/edit_distance.pyx b/python/cudf/cudf/_lib/nvtext/edit_distance.pyx index f9fae570469..a1e59585df2 100644 --- a/python/cudf/cudf/_lib/nvtext/edit_distance.pyx +++ b/python/cudf/cudf/_lib/nvtext/edit_distance.pyx @@ -7,7 +7,8 @@ from libcpp.utility cimport move from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.nvtext.edit_distance cimport ( - edit_distance as cpp_edit_distance + edit_distance as cpp_edit_distance, + edit_distance_matrix as cpp_edit_distance_matrix ) from cudf._lib.column cimport Column 
@@ -21,3 +22,13 @@ def edit_distance(Column strings, Column targets): c_result = move(cpp_edit_distance(c_strings, c_targets)) return Column.from_unique_ptr(move(c_result)) + + +def edit_distance_matrix(Column strings): + cdef column_view c_strings = strings.view() + cdef unique_ptr[column] c_result + + with nogil: + c_result = move(cpp_edit_distance_matrix(c_strings)) + + return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 9fbd237b518..d5e75039070 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -17,7 +17,10 @@ from cudf import _lib as libcudf from cudf._lib import string_casting as str_cast from cudf._lib.column import Column -from cudf._lib.nvtext.edit_distance import edit_distance as cpp_edit_distance +from cudf._lib.nvtext.edit_distance import ( + edit_distance as cpp_edit_distance, + edit_distance_matrix as cpp_edit_distance_matrix, +) from cudf._lib.nvtext.generate_ngrams import ( generate_character_ngrams as cpp_generate_character_ngrams, generate_ngrams as cpp_generate_ngrams, @@ -4859,6 +4862,47 @@ def edit_distance(self, targets) -> ParentType: cpp_edit_distance(self._column, targets_column) ) + def edit_distance_matrix(self) -> ParentType: + """Computes the edit distance between strings in the series. + + The series to compute the matrix should have at least 2 strings and + should not contain nulls. + + Edit distance is measured based on the `Levenshtein edit distance + algorithm + <https://en.wikipedia.org/wiki/Levenshtein_distance>`_. + + + Returns + ------- + Series of ListDtype(int64) + Assume `N` is the length of this series. The return series contains + `N` lists of size `N`, where the `j`th number in the `i`th row of + the series tells the edit distance between the `i`th string and the + `j`th string of this series. + The matrix is symmetric. Diagonal elements are 0. 
+ + Examples + -------- + >>> import cudf + >>> s = cudf.Series(['abc', 'bc', 'cba']) + >>> s.str.edit_distance_matrix() + 0 [0, 1, 2] + 1 [1, 0, 2] + 2 [2, 2, 0] + dtype: list + """ + if self._column.size < 2: + raise ValueError( + "Require size >= 2 to compute edit distance matrix." + ) + if self._column.has_nulls: + raise ValueError( + "Cannot compute edit distance between null strings. " + "Consider removing them using `dropna` or fill with `fillna`." + ) + return self._return_or_inplace(cpp_edit_distance_matrix(self._column)) + def _massage_string_arg(value, name, allow_col=False): if isinstance(value, str): diff --git a/python/cudf/cudf/tests/test_text.py b/python/cudf/cudf/tests/test_text.py index 74465c4a54d..072fc23abba 100644 --- a/python/cudf/cudf/tests/test_text.py +++ b/python/cudf/cudf/tests/test_text.py @@ -787,6 +787,34 @@ def test_edit_distance(): assert_eq(expected, actual) +def test_edit_distance_matrix(): + # normal + sr = cudf.Series(["rounded", "bounded", "bounce", "trounce", "ounce"]) + + expected = cudf.Series( + [ + [0, 1, 3, 3, 3], + [1, 0, 2, 4, 3], + [3, 2, 0, 2, 1], + [3, 4, 2, 0, 2], + [3, 3, 1, 2, 0], + ] + ) + got = sr.str.edit_distance_matrix() + + assert_eq(expected, got, check_dtype=False) + + # 1-row series + sr2 = cudf.Series(["x"]) + with pytest.raises(ValueError, match="Require size >= 2"): + sr2.str.edit_distance_matrix() + + # null rows + sr3 = cudf.Series(["rounded", None, "bounce", "trounce", "ounce"]) + with pytest.raises(ValueError, match="Cannot compute"): + sr3.str.edit_distance_matrix() + + def test_porter_stemmer_measure(): strings = cudf.Series( [