From cfa2d513667edabda6c4487f15f251f757f0c94d Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Wed, 22 Nov 2023 10:21:52 -0500 Subject: [PATCH] Expose stream parameter in public nvtext APIs (#14456) Add stream parameter to public APIs: - `nvtext::is_letter()` - `nvtext::porter_stemmer_measure` - `nvtext::edit_distance()` - `nvtext::edit_distance_matrix()` Also cleaned up some of the doxygen comments and added stream gtests. Reference #13744 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Mark Harris (https://github.com/harrism) URL: https://github.com/rapidsai/cudf/pull/14456 --- cpp/include/nvtext/edit_distance.hpp | 42 +++++++++-------- cpp/include/nvtext/stemmer.hpp | 46 +++++++++++-------- cpp/src/text/edit_distance.cu | 10 ++-- cpp/src/text/stemmer.cu | 20 ++++---- cpp/tests/CMakeLists.txt | 10 +++- cpp/tests/streams/text/edit_distance_test.cpp | 33 +++++++++++++ cpp/tests/streams/text/stemmer_test.cpp | 42 +++++++++++++++++ 7 files changed, 148 insertions(+), 55 deletions(-) create mode 100644 cpp/tests/streams/text/edit_distance_test.cpp create mode 100644 cpp/tests/streams/text/stemmer_test.cpp diff --git a/cpp/include/nvtext/edit_distance.hpp b/cpp/include/nvtext/edit_distance.hpp index 953ecf7734d..9a24662455b 100644 --- a/cpp/include/nvtext/edit_distance.hpp +++ b/cpp/include/nvtext/edit_distance.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -30,7 +30,7 @@ namespace nvtext { /** * @brief Compute the edit distance between individual strings in two strings columns. * - * The `output[i]` is the edit distance between `strings[i]` and `targets[i]`. + * The `output[i]` is the edit distance between `input[i]` and `targets[i]`. 
* This edit distance calculation uses the Levenshtein algorithm as documented here: * https://www.cuelogic.com/blog/the-levenshtein-algorithm * @@ -42,23 +42,25 @@ namespace nvtext { * d is now [1, 7, 0] * @endcode * - * Any null entries for either `strings` or `targets` is ignored and the edit distance + * Any null entries for either `input` or `targets` is ignored and the edit distance * is computed as though the null entry is an empty string. * - * The `targets.size()` must equal `strings.size()` unless `targets.size()==1`. - * In this case, all `strings` will be computed against the single `targets[0]` string. + * The `targets.size()` must equal `input.size()` unless `targets.size()==1`. + * In this case, all `input` will be computed against the single `targets[0]` string. * - * @throw cudf::logic_error if `targets.size() != strings.size()` and + * @throw cudf::logic_error if `targets.size() != input.size()` and * if `targets.size() != 1` * - * @param strings Strings column of input strings - * @param targets Strings to compute edit distance against `strings` - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings columns of with replaced strings. 
+ * @param input Strings column of input strings + * @param targets Strings to compute edit distance against `input` + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New column of edit distance values */ std::unique_ptr edit_distance( - cudf::strings_column_view const& strings, + cudf::strings_column_view const& input, cudf::strings_column_view const& targets, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -67,7 +69,7 @@ std::unique_ptr edit_distance( * This uses the Levenshtein algorithm to calculate the edit distance between * two strings as documented here: https://www.cuelogic.com/blog/the-levenshtein-algorithm * - * The output is essentially a `strings.size() x strings.size()` square matrix of integers. + * The output is essentially an `input.size() x input.size()` square matrix of integers. * All values at diagonal `row == col` are 0 since the edit distance between two identical * strings is zero. All values above the diagonal are reflected below since the edit distance * calculation is also commutative. @@ -81,20 +83,22 @@ std::unique_ptr edit_distance( * [1, 2, 0]] * @endcode * - * Null entries for `strings` are ignored and the edit distance + * Null entries for `input` are ignored and the edit distance * is computed as though the null entry is an empty string. * - * The output is a lists column of size `strings.size()` and where each list item - * is `strings.size()` elements. + * The output is a lists column of size `input.size()` and where each list item + * is `input.size()` elements. * * @throw cudf::logic_error if `strings.size() == 1` * - * @param strings Strings column of input strings - * @param mr Device memory resource used to allocate the returned column's device memory. 
- * @return New lists column of edit distance values. + * @param input Strings column of input strings + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New lists column of edit distance values */ std::unique_ptr edit_distance_matrix( - cudf::strings_column_view const& strings, + cudf::strings_column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/nvtext/stemmer.hpp b/cpp/include/nvtext/stemmer.hpp index 0a57f8944d4..0e1759fdc5a 100644 --- a/cpp/include/nvtext/stemmer.hpp +++ b/cpp/include/nvtext/stemmer.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -56,7 +56,7 @@ enum class letter_type { * * A negative index value will check the character starting from the end * of each string. That is, for `character_index < 0` the letter checked for string - * `strings[i]` is at position `strings[i].length + index`. + * `input[i]` is at position `input[i].length + index`. * * @code{.pseudo} * Example: @@ -68,20 +68,22 @@ enum class letter_type { * A null input element at row `i` produces a corresponding null entry * for row `i` in the output column. * - * @param strings Strings column of words to measure. - * @param ltype Specify letter type to check. - * @param character_index The character position to check in each string. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New BOOL column. 
+ * @param input Strings column of words to measure + * @param ltype Specify letter type to check + * @param character_index The character position to check in each string + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New BOOL column */ std::unique_ptr is_letter( - cudf::strings_column_view const& strings, + cudf::strings_column_view const& input, letter_type ltype, cudf::size_type character_index, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Returns boolean column indicating if character at `indices[i]` of `strings[i]` + * @brief Returns boolean column indicating if character at `indices[i]` of `input[i]` * is a consonant or vowel. * * Determining consonants and vowels is described in the following @@ -116,19 +118,21 @@ std::unique_ptr is_letter( * A null input element at row `i` produces a corresponding null entry * for row `i` in the output column. * - * @throw cudf::logic_error if `indices.size() != strings.size()` + * @throw cudf::logic_error if `indices.size() != input.size()` * @throw cudf::logic_error if `indices` contain nulls. * - * @param strings Strings column of words to measure. - * @param ltype Specify letter type to check. - * @param indices The character positions to check in each string. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New BOOL column. 
+ * @param input Strings column of words to measure + * @param ltype Specify letter type to check + * @param indices The character positions to check in each string + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New BOOL column */ std::unique_ptr is_letter( - cudf::strings_column_view const& strings, + cudf::strings_column_view const& input, letter_type ltype, cudf::column_view const& indices, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -155,12 +159,14 @@ std::unique_ptr is_letter( * A null input element at row `i` produces a corresponding null entry * for row `i` in the output column. * - * @param strings Strings column of words to measure. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New INT32 column of measure values. 
+ * @param input Strings column of words to measure + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New INT32 column of measure values */ std::unique_ptr porter_stemmer_measure( - cudf::strings_column_view const& strings, + cudf::strings_column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/src/text/edit_distance.cu b/cpp/src/text/edit_distance.cu index 3d5f2d72e6f..a1d97409987 100644 --- a/cpp/src/text/edit_distance.cu +++ b/cpp/src/text/edit_distance.cu @@ -298,22 +298,24 @@ std::unique_ptr edit_distance_matrix(cudf::strings_column_view con /** * @copydoc nvtext::edit_distance */ -std::unique_ptr edit_distance(cudf::strings_column_view const& strings, +std::unique_ptr edit_distance(cudf::strings_column_view const& input, cudf::strings_column_view const& targets, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::edit_distance(strings, targets, cudf::get_default_stream(), mr); + return detail::edit_distance(input, targets, stream, mr); } /** * @copydoc nvtext::edit_distance_matrix */ -std::unique_ptr edit_distance_matrix(cudf::strings_column_view const& strings, +std::unique_ptr edit_distance_matrix(cudf::strings_column_view const& input, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::edit_distance_matrix(strings, cudf::get_default_stream(), mr); + return detail::edit_distance_matrix(input, stream, mr); } } // namespace nvtext diff --git a/cpp/src/text/stemmer.cu b/cpp/src/text/stemmer.cu index 2b2b8429d9c..bdcb0b2af32 100644 --- a/cpp/src/text/stemmer.cu +++ b/cpp/src/text/stemmer.cu @@ -250,36 +250,36 @@ std::unique_ptr is_letter(cudf::strings_column_view 
const& strings // external APIs -std::unique_ptr is_letter(cudf::strings_column_view const& strings, +std::unique_ptr is_letter(cudf::strings_column_view const& input, letter_type ltype, cudf::size_type character_index, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::is_letter(strings, - ltype, - thrust::make_constant_iterator(character_index), - cudf::get_default_stream(), - mr); + return detail::is_letter( + input, ltype, thrust::make_constant_iterator(character_index), stream, mr); } -std::unique_ptr is_letter(cudf::strings_column_view const& strings, +std::unique_ptr is_letter(cudf::strings_column_view const& input, letter_type ltype, cudf::column_view const& indices, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::is_letter(strings, ltype, indices, cudf::get_default_stream(), mr); + return detail::is_letter(input, ltype, indices, stream, mr); } /** * @copydoc nvtext::porter_stemmer_measure */ -std::unique_ptr porter_stemmer_measure(cudf::strings_column_view const& strings, +std::unique_ptr porter_stemmer_measure(cudf::strings_column_view const& input, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::porter_stemmer_measure(strings, cudf::get_default_stream(), mr); + return detail::porter_stemmer_measure(input, stream, mr); } } // namespace nvtext diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 1be8566fb0f..b35c72b9e9d 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -664,8 +664,14 @@ ConfigureTest( testing ) ConfigureTest( - STREAM_TEXT_TEST streams/text/ngrams_test.cpp streams/text/replace_test.cpp - streams/text/tokenize_test.cpp STREAM_MODE testing + STREAM_TEXT_TEST + streams/text/edit_distance_test.cpp + streams/text/ngrams_test.cpp + streams/text/replace_test.cpp + streams/text/stemmer_test.cpp + streams/text/tokenize_test.cpp + 
STREAM_MODE + testing ) ConfigureTest(STREAM_UNARY_TEST streams/unary_test.cpp STREAM_MODE testing) diff --git a/cpp/tests/streams/text/edit_distance_test.cpp b/cpp/tests/streams/text/edit_distance_test.cpp new file mode 100644 index 00000000000..59206c39e69 --- /dev/null +++ b/cpp/tests/streams/text/edit_distance_test.cpp @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include + +class TextEditDistanceTest : public cudf::test::BaseFixture {}; + +TEST_F(TextEditDistanceTest, EditDistance) +{ + auto const input = cudf::test::strings_column_wrapper({"dog", "cat", "mouse", "pupper"}); + auto const input_view = cudf::strings_column_view(input); + auto const target = cudf::test::strings_column_wrapper({"hog", "cake", "house", "puppy"}); + auto const target_view = cudf::strings_column_view(target); + nvtext::edit_distance(input_view, target_view, cudf::test::get_default_stream()); + nvtext::edit_distance_matrix(input_view, cudf::test::get_default_stream()); +} diff --git a/cpp/tests/streams/text/stemmer_test.cpp b/cpp/tests/streams/text/stemmer_test.cpp new file mode 100644 index 00000000000..7aa51befa73 --- /dev/null +++ b/cpp/tests/streams/text/stemmer_test.cpp @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include + +class TextStemmerTest : public cudf::test::BaseFixture {}; + +TEST_F(TextStemmerTest, IsLetter) +{ + auto const input = + cudf::test::strings_column_wrapper({"abbey", "normal", "creates", "yearly", "trouble"}); + auto const view = cudf::strings_column_view(input); + auto const delimiter = cudf::string_scalar{" ", true, cudf::test::get_default_stream()}; + nvtext::is_letter(view, nvtext::letter_type::VOWEL, 0, cudf::test::get_default_stream()); + auto const indices = cudf::test::fixed_width_column_wrapper({0, 1, 3, 5, 4}); + nvtext::is_letter(view, nvtext::letter_type::VOWEL, indices, cudf::test::get_default_stream()); +} + +TEST_F(TextStemmerTest, Porter) +{ + auto const input = + cudf::test::strings_column_wrapper({"abbey", "normal", "creates", "yearly", "trouble"}); + auto const view = cudf::strings_column_view(input); + nvtext::porter_stemmer_measure(view, cudf::test::get_default_stream()); +}