From 0b0b1f68d5747b1321d8ec35c2d9df9012bbab5e Mon Sep 17 00:00:00 2001 From: David Wendt Date: Fri, 14 Oct 2022 17:04:29 -0400 Subject: [PATCH 01/11] Add regex_program class for use with all regex APIs --- cpp/CMakeLists.txt | 3 +- cpp/include/cudf/strings/contains.hpp | 81 +++++++ cpp/include/cudf/strings/extract.hpp | 68 ++++++ cpp/include/cudf/strings/findall.hpp | 36 +++ cpp/include/cudf/strings/regex/flags.hpp | 2 +- .../cudf/strings/regex/regex_program.hpp | 121 ++++++++++ cpp/include/cudf/strings/replace_re.hpp | 50 ++++ cpp/include/cudf/strings/split/split_re.hpp | 223 ++++++++++++++++++ cpp/include/doxygen_groups.h | 1 + cpp/src/strings/contains.cu | 56 +++-- cpp/src/strings/extract/extract.cu | 19 +- cpp/src/strings/extract/extract_all.cu | 20 +- cpp/src/strings/regex/regcomp.cpp | 2 +- cpp/src/strings/regex/regcomp.h | 2 +- cpp/src/strings/regex/regex.cuh | 40 ++-- cpp/src/strings/regex/regex_program.cpp | 64 +++++ cpp/src/strings/regex/regex_program_impl.h | 37 +++ .../strings/regex/{regexec.cu => regexec.cpp} | 25 +- cpp/src/strings/replace/backref_re.cu | 22 +- cpp/src/strings/replace/multi_re.cu | 4 +- cpp/src/strings/replace/replace_re.cu | 22 +- cpp/src/strings/search/findall.cu | 19 +- cpp/src/strings/split/split_re.cu | 84 +++++-- 23 files changed, 895 insertions(+), 106 deletions(-) create mode 100644 cpp/include/cudf/strings/regex/regex_program.hpp create mode 100644 cpp/src/strings/regex/regex_program.cpp create mode 100644 cpp/src/strings/regex/regex_program_impl.h rename cpp/src/strings/regex/{regexec.cu => regexec.cpp} (90%) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index bfabbbc625d..87bf23b9f84 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -504,7 +504,8 @@ add_library( src/strings/padding.cu src/strings/json/json_path.cu src/strings/regex/regcomp.cpp - src/strings/regex/regexec.cu + src/strings/regex/regexec.cpp + src/strings/regex/regex_program.cpp src/strings/repeat_strings.cu src/strings/replace/backref_re.cu 
src/strings/replace/multi_re.cu diff --git a/cpp/include/cudf/strings/contains.hpp b/cpp/include/cudf/strings/contains.hpp index d95dc2c418c..1718d205871 100644 --- a/cpp/include/cudf/strings/contains.hpp +++ b/cpp/include/cudf/strings/contains.hpp @@ -24,6 +24,9 @@ namespace cudf { namespace strings { + +struct regex_program; + /** * @addtogroup strings_contains * @{ @@ -58,6 +61,32 @@ std::unique_ptr contains_re( regex_flags const flags = regex_flags::DEFAULT, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Returns a boolean column identifying rows which + * match the given regex_program object + * + * @code{.pseudo} + * Example: + * s = ["abc", "123", "def456"] + * p = regex_program::create("\\d+") + * r = contains_re(s, p) + * r is now [false, true, true] + * @endcode + * + * Any null string entries return corresponding null output column entries. + * + * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. + * + * @param strings Strings instance for this operation + * @param prog Regex program instance + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New column of boolean results for each string + */ +std::unique_ptr contains_re( + strings_column_view const& strings, + regex_program const& prog, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Returns a boolean column identifying rows which * matching the given regex pattern but only at the beginning the string. @@ -85,6 +114,32 @@ std::unique_ptr matches_re( regex_flags const flags = regex_flags::DEFAULT, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Returns a boolean column identifying rows which + * match the given regex_program object but only at the beginning of the string. 
+ * + * @code{.pseudo} + * Example: + * s = ["abc", "123", "def456"] + * p = regex_program::create("\\d+") + * r = matches_re(s, p) + * r is now [false, true, false] + * @endcode + * + * Any null string entries return corresponding null output column entries. + * + * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. + * + * @param strings Strings instance for this operation + * @param prog Regex program instance + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New column of boolean results for each string + */ +std::unique_ptr matches_re( + strings_column_view const& strings, + regex_program const& prog, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Returns the number of times the given regex pattern * matches in each string. @@ -112,6 +167,32 @@ std::unique_ptr count_re( regex_flags const flags = regex_flags::DEFAULT, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Returns the number of times the given regex_program's pattern + * matches in each string + * + * @code{.pseudo} + * Example: + * s = ["abc", "123", "def45"] + * p = regex_program::create("\\d") + * r = count_re(s, p) + * r is now [0, 3, 2] + * @endcode + * + * Any null string entries return corresponding null output column entries. + * + * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. 
+ * + * @param strings Strings instance for this operation + * @param prog Regex program instance + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New INT32 column with counts for each string + */ +std::unique_ptr count_re( + strings_column_view const& strings, + regex_program const& prog, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Returns a boolean column identifying rows which * match the given like pattern. diff --git a/cpp/include/cudf/strings/extract.hpp b/cpp/include/cudf/strings/extract.hpp index a30098bedb9..a80d971438d 100644 --- a/cpp/include/cudf/strings/extract.hpp +++ b/cpp/include/cudf/strings/extract.hpp @@ -23,6 +23,9 @@ namespace cudf { namespace strings { + +struct regex_program; + /** * @addtogroup strings_substring * @{ @@ -61,6 +64,37 @@ std::unique_ptr extract( regex_flags const flags = regex_flags::DEFAULT, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Returns a table of strings columns where each column corresponds to the matching + * group specified in the given regex_program object + * + * All the strings for the first group will go in the first output column; the second group + * go in the second column and so on. Null entries are added to the columns in row `i` if + * the string at row `i` does not match. + * + * Any null string entries return corresponding null output column entries. + * + * @code{.pseudo} + * Example: + * s = ["a1", "b2", "c3"] + * p = regex_program::create("([ab])(\\d)") + * r = extract(s, p) + * r is now [ ["a", "b", null], + * ["1", "2", null] ] + * @endcode + * + * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. 
+ * + * @param strings Strings instance for this operation + * @param prog Regex program instance + * @param mr Device memory resource used to allocate the returned table's device memory + * @return Columns of strings extracted from the input column + */ +std::unique_ptr
extract( + strings_column_view const& strings, + regex_program const& prog, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Returns a lists column of strings where each string column row corresponds to the * matching group specified in the given regular expression pattern. @@ -96,6 +130,40 @@ std::unique_ptr extract_all_record( regex_flags const flags = regex_flags::DEFAULT, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Returns a lists column of strings where each string column row corresponds to the + * matching group specified in the given regex_program object + * + * All the matching groups for the first row will go in the first row output column; the second + * row results will go into the second row output column and so on. + * + * A null output row will result if the corresponding input string row does not match or + * that input row is null. + * + * @code{.pseudo} + * Example: + * s = ["a1 b4", "b2", "c3 a5", "b", null] + * p = regex_program::create("([ab])(\\d)") + * r = extract_all_record(s, p) + * r is now [ ["a", "1", "b", "4"], + * ["b", "2"], + * ["a", "5"], + * null, + * null ] + * @endcode + * + * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. 
+ * + * @param strings Strings instance for this operation + * @param prog Regex program instance + * @param mr Device memory resource used to allocate any returned device memory + * @return Lists column containing strings extracted from the input column + */ +std::unique_ptr extract_all_record( + strings_column_view const& strings, + regex_program const& prog, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** @} */ // end of doxygen group } // namespace strings } // namespace cudf diff --git a/cpp/include/cudf/strings/findall.hpp b/cpp/include/cudf/strings/findall.hpp index 6969ba35b1b..366e1eb0482 100644 --- a/cpp/include/cudf/strings/findall.hpp +++ b/cpp/include/cudf/strings/findall.hpp @@ -23,6 +23,9 @@ namespace cudf { namespace strings { + +struct regex_program; + /** * @addtogroup strings_contains * @{ @@ -63,6 +66,39 @@ std::unique_ptr findall( regex_flags const flags = regex_flags::DEFAULT, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Returns a lists column of strings for each matching occurrence using + * the regex_program pattern within each string + * + * Each output row includes all the substrings within the corresponding input row + * that match the given pattern. If no matches are found, the output row is empty. + * + * @code{.pseudo} + * Example: + * s = ["bunny", "rabbit", "hare", "dog"] + * p = regex_program::create("[ab]") + * r = findall(s, p) + * r is now a lists column like: + * [ ["b"] + * ["a","b","b"] + * ["a"] + * [] ] + * @endcode + * + * A null output row occurs if the corresponding input row is null. + * + * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. 
+ * + * @param input Strings instance for this operation + * @param prog Regex program instance + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New lists column of strings + */ +std::unique_ptr findall( + strings_column_view const& input, + regex_program const& prog, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** @} */ // end of doxygen group } // namespace strings } // namespace cudf diff --git a/cpp/include/cudf/strings/regex/flags.hpp b/cpp/include/cudf/strings/regex/flags.hpp index 3a7051345fa..44ca68439e7 100644 --- a/cpp/include/cudf/strings/regex/flags.hpp +++ b/cpp/include/cudf/strings/regex/flags.hpp @@ -21,7 +21,7 @@ namespace cudf { namespace strings { /** - * @addtogroup strings_contains + * @addtogroup strings_regex * @{ */ diff --git a/cpp/include/cudf/strings/regex/regex_program.hpp b/cpp/include/cudf/strings/regex/regex_program.hpp new file mode 100644 index 00000000000..cfe1f07e062 --- /dev/null +++ b/cpp/include/cudf/strings/regex/regex_program.hpp @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +#include +#include + +namespace cudf { +namespace strings { + +/** + * @addtogroup strings_regex + * @{ + */ + +/** + * @brief Regex program class. + * + * Create an instance from a regex pattern and use it to call + * strings APIs. 
An instance can be reused. + * + * See the @ref md_regex "Regex Features" page for details on patterns and APIs that support regex. + */ +struct regex_program { + struct regex_program_impl; + + /** + * @brief Create a program from a pattern + * + * @param pattern Regex pattern + * @param flags Regex flags for interpreting special characters in the pattern + * @param capture Control how capture groups in the pattern are used + * @return Instance of this object + */ + static std::unique_ptr create(std::string_view pattern, + regex_flags flags = regex_flags::DEFAULT, + capture_groups capture = capture_groups::EXTRACT); + + regex_program(regex_program&& other); + regex_program& operator=(regex_program&& other); + + /** + * @brief Return the pattern used to create this instance + * + * @return regex pattern as a string + */ + std::string pattern() const; + + /** + * @brief Return the regex_flags used to create this instance + * + * @return regex flags setting + */ + regex_flags flags() const; + + /** + * @brief Return the capture_groups used to create this instance + * + * @return capture groups setting + */ + capture_groups capture() const; + + /** + * @brief Return the number of instructions in this instance + * + * @return Number of instructions + */ + int32_t instructions_count() const; + + /** + * @brief Return the number of capture groups in this instance + * + * @return Number of groups + */ + int32_t groups_count() const; + + /** + * @brief Return implementation object + * + * @return impl object instance + */ + regex_program_impl* get_impl() const; + + /** + * @brief Return the working memory size computed for `num_threads` threads + * + * @return Size of the working memory + */ + std::size_t compute_working_memory_size(int32_t num_threads) const; + + private: + regex_program(); + + std::string _pattern; + regex_flags _flags; + capture_groups _capture; + + std::unique_ptr _impl; + + regex_program(std::string_view pattern, regex_flags flags, capture_groups capture); +}; + +/** @} 
*/ // end of doxygen group +} // namespace strings +} // namespace cudf diff --git a/cpp/include/cudf/strings/replace_re.hpp b/cpp/include/cudf/strings/replace_re.hpp index d80b9a89b81..60c66956fb8 100644 --- a/cpp/include/cudf/strings/replace_re.hpp +++ b/cpp/include/cudf/strings/replace_re.hpp @@ -26,6 +26,9 @@ namespace cudf { namespace strings { + +struct regex_program; + /** * @addtogroup strings_replace * @{ @@ -58,6 +61,30 @@ std::unique_ptr replace_re( regex_flags const flags = regex_flags::DEFAULT, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief For each string, replaces any character sequence matching the given regex + * with the provided replacement string. + * + * Any null string entries return corresponding null output column entries. + * + * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. + * + * @param strings Strings instance for this operation + * @param prog Regex program instance + * @param replacement The string used to replace the matched sequence in each string. + * Default is an empty string. + * @param max_replace_count The maximum number of times to replace the matched pattern + * within each string. Default replaces every substring that is matched. + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column + */ +std::unique_ptr replace_re( + strings_column_view const& strings, + regex_program const& prog, + string_scalar const& replacement = string_scalar(""), + std::optional max_replace_count = std::nullopt, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief For each string, replaces any character sequence matching the given patterns * with the corresponding string in the `replacements` column. 
@@ -105,5 +132,28 @@ std::unique_ptr replace_with_backrefs( regex_flags const flags = regex_flags::DEFAULT, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief For each string, replaces any character sequence matching the given regex + * using the replacement template for back-references. + * + * Any null string entries return corresponding null output column entries. + * + * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. + * + * @throw cudf::logic_error if capture index values in `replacement` are not in range 0-99, and also + * if the index exceeds the group count specified in the pattern + * + * @param strings Strings instance for this operation + * @param prog Regex program instance + * @param replacement The replacement template for creating the output string + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column + */ +std::unique_ptr replace_with_backrefs( + strings_column_view const& strings, + regex_program const& prog, + std::string_view replacement, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + } // namespace strings } // namespace cudf diff --git a/cpp/include/cudf/strings/split/split_re.hpp b/cpp/include/cudf/strings/split/split_re.hpp index 6fe07b0f5dc..c6bd1345ae6 100644 --- a/cpp/include/cudf/strings/split/split_re.hpp +++ b/cpp/include/cudf/strings/split/split_re.hpp @@ -23,6 +23,9 @@ namespace cudf { namespace strings { + +struct regex_program; + /** * @addtogroup strings_split * @{ @@ -77,6 +80,58 @@ std::unique_ptr
split_re( size_type maxsplit = -1, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Splits strings elements into a table of strings columns + * using a regex_program's pattern to delimit each string + * + * Each element generates a vector of strings that are stored in corresponding + * rows in the output table -- `table[col,row] = token[col] of strings[row]` + * where `token` is a substring between delimiters. + * + * The number of rows in the output table will be the same as the number of + * elements in the input column. The resulting number of columns will be the + * maximum number of tokens found in any input row. + * + * The `pattern` is used to identify the delimiters within a string + * and splitting stops when either `maxsplit` or the end of the string is reached. + * + * An empty input string will produce a corresponding empty string in the + * corresponding row of the first column. + * A null row will produce corresponding null rows in the output table. + * + * The regex_program's regex_flags are ignored. + * + * @code{.pseudo} + * s = ["a_bc def_g", "a__bc", "_ab cd", "ab_cd "] + * p1 = regex_program::create("[_ ]") + * s1 = split_re(s, p1) + * s1 is a table of strings columns: + * [ ["a", "a", "", "ab"], + * ["bc", "", "ab", "cd"], + * ["def", "bc", "cd", ""], + * ["g", null, null, null] ] + * p2 = regex_program::create("[ _]") + * s2 = split_re(s, p2, 1) + * s2 is a table of strings columns: + * [ ["a", "a", "", "ab"], + * ["bc def_g", "_bc", "ab cd", "cd "] ] + * @endcode + * + * @throw cudf::logic_error if `pattern` is empty. + * + * @param input A column of string elements to be split + * @param prog Regex program instance + * @param maxsplit Maximum number of splits to perform. + * Default of -1 indicates all possible splits on each string. + * @param mr Device memory resource used to allocate the returned result's device memory + * @return A table of columns of strings + */ +std::unique_ptr
split_re( + strings_column_view const& input, + regex_program const& prog, + size_type maxsplit = -1, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Splits strings elements into a table of strings columns * using a regex pattern to delimit each string starting from the end of the string. @@ -127,6 +182,60 @@ std::unique_ptr
rsplit_re( size_type maxsplit = -1, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Splits strings elements into a table of strings columns using a + * regex_program's pattern to delimit each string starting from the end of the string + * + * Each element generates a vector of strings that are stored in corresponding + * rows in the output table -- `table[col,row] = token[col] of string[row]` + * where `token` is the substring between each delimiter. + * + * The number of rows in the output table will be the same as the number of + * elements in the input column. The resulting number of columns will be the + * maximum number of tokens found in any input row. + * + * Splitting occurs by traversing starting from the end of the input string. + * The `pattern` is used to identify the delimiters within a string + * and splitting stops when either `maxsplit` or the beginning of the string + * is reached. + * + * An empty input string will produce a corresponding empty string in the + * corresponding row of the first column. + * A null row will produce corresponding null rows in the output table. + * + * The regex_program's regex_flags are ignored. + * + * @code{.pseudo} + * s = ["a_bc def_g", "a__bc", "_ab cd", "ab_cd "] + * p1 = regex_program::create("[_ ]") + * s1 = rsplit_re(s, p1) + * s1 is a table of strings columns: + * [ ["a", "a", "", "ab"], + * ["bc", "", "ab", "cd"], + * ["def", "bc", "cd", ""], + * ["g", null, null, null] ] + * p2 = regex_program::create("[ _]") + * s2 = rsplit_re(s, p2, 1) + * s2 is a table of strings columns: + * [ ["a_bc def", "a_", "_ab", "ab"], + * ["g", "bc", "cd", "cd "] ] + * @endcode + * + * @throw cudf::logic_error if `pattern` is empty. + * + * @param input A column of string elements to be split. + * @param prog Regex program instance + * @param maxsplit Maximum number of splits to perform. + * Default of -1 indicates all possible splits on each string. 
+ * @param mr Device memory resource used to allocate the returned result's device memory. + * @return A table of columns of strings. + */ +std::unique_ptr
rsplit_re( + strings_column_view const& input, + regex_program const& prog, + size_type maxsplit = -1, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Splits strings elements into a list column of strings * using the given regex pattern to delimit each string. @@ -179,6 +288,62 @@ std::unique_ptr split_record_re( size_type maxsplit = -1, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Splits strings elements into a list column of strings + * using the given regex_program to delimit each string + * + * Each element generates an array of strings that are stored in an output + * lists column -- `list[row] = [token1, token2, ...] found in input[row]` + * where `token` is a substring between delimiters. + * + * The number of elements in the output column will be the same as the number of + * elements in the input column. Each individual list item will contain the + * new strings for that row. The resulting number of strings in each row can vary + * from 0 to `maxsplit + 1`. + * + * The `pattern` is used to identify the delimiters within a string + * and splitting stops when either `maxsplit` or the end of the string is reached. + * + * An empty input string will produce a corresponding empty list item output row. + * A null row will produce a corresponding null output row. + * + * The regex_program's regex_flags are ignored. + * + * @code{.pseudo} + * s = ["a_bc def_g", "a__bc", "_ab cd", "ab_cd "] + * p1 = regex_program::create("[_ ]") + * s1 = split_record_re(s, p1) + * s1 is a lists column of strings: + * [ ["a", "bc", "def", "g"], + * ["a", "", "bc"], + * ["", "ab", "cd"], + * ["ab", "cd", ""] ] + * p2 = regex_program::create("[ _]") + * s2 = split_record_re(s, p2, 1) + * s2 is a lists column of strings: + * [ ["a", "bc def_g"], + * ["a", "_bc"], + * ["", "ab cd"], + * ["ab", "cd "] ] + * @endcode + * + * @throw cudf::logic_error if `pattern` is empty. 
+ * + * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. + * + * @param input A column of string elements to be split + * @param prog Regex program instance + * @param maxsplit Maximum number of splits to perform. + * Default of -1 indicates all possible splits on each string. + * @param mr Device memory resource used to allocate the returned result's device memory + * @return Lists column of strings. + */ +std::unique_ptr split_record_re( + strings_column_view const& input, + regex_program const& prog, + size_type maxsplit = -1, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Splits strings elements into a list column of strings * using the given regex pattern to delimit each string starting from the end of the string. @@ -233,6 +398,64 @@ std::unique_ptr rsplit_record_re( size_type maxsplit = -1, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Splits strings elements into a list column of strings using the given + * regex_program to delimit each string starting from the end of the string + * + * Each element generates a vector of strings that are stored in an output + * lists column -- `list[row] = [token1, token2, ...] found in input[row]` + * where `token` is a substring between delimiters. + * + * The number of elements in the output column will be the same as the number of + * elements in the input column. Each individual list item will contain the + * new strings for that row. The resulting number of strings in each row can vary + * from 0 to `maxsplit + 1`. + * + * Splitting occurs by traversing starting from the end of the input string. + * The `pattern` is used to identify the separation points within a string + * and splitting stops when either `maxsplit` or the beginning of the string + * is reached. + * + * An empty input string will produce a corresponding empty list item output row. 
+ * A null row will produce a corresponding null output row. + * + * The regex_program's regex_flags are ignored. + * + * @code{.pseudo} + * s = ["a_bc def_g", "a__bc", "_ab cd", "ab_cd "] + * p1 = regex_program::create("[_ ]") + * s1 = rsplit_record_re(s, p1) + * s1 is a lists column of strings: + * [ ["a", "bc", "def", "g"], + * ["a", "", "bc"], + * ["", "ab", "cd"], + * ["ab", "cd", ""] ] + * p2 = regex_program::create("[ _]") + * s2 = rsplit_record_re(s, p2, 1) + * s2 is a lists column of strings: + * [ ["a_bc def", "g"], + * ["a_", "bc"], + * ["_ab", "cd"], + * ["ab_cd", ""] ] + * @endcode + * + * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. + * + * @throw cudf::logic_error if `pattern` is empty. + * + * @param input A column of string elements to be split + * @param prog Regex program instance + * @param maxsplit Maximum number of splits to perform. + * Default of -1 indicates all possible splits on each string. + * @param mr Device memory resource used to allocate the returned result's device memory + * @return Lists column of strings + */ +std::unique_ptr rsplit_record_re( + strings_column_view const& input, + regex_program const& prog, + size_type maxsplit = -1, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** @} */ // end of doxygen group } // namespace strings } // namespace cudf diff --git a/cpp/include/doxygen_groups.h b/cpp/include/doxygen_groups.h index c0ea06959b2..5c335b720d5 100644 --- a/cpp/include/doxygen_groups.h +++ b/cpp/include/doxygen_groups.h @@ -129,6 +129,7 @@ * @defgroup strings_replace Replacing * @defgroup strings_split Splitting * @defgroup strings_json JSON + * @defgroup strings_regex Regex * @} * @defgroup dictionary_apis Dictionary * @{ diff --git a/cpp/src/strings/contains.cu b/cpp/src/strings/contains.cu index b7d154c4808..e4f3e19cbac 100644 --- a/cpp/src/strings/contains.cu +++ b/cpp/src/strings/contains.cu @@ -15,6 +15,7 @@ */ #include 
+#include #include #include @@ -57,8 +58,7 @@ struct contains_fn { }; std::unique_ptr contains_impl(strings_column_view const& input, - std::string_view pattern, - regex_flags const flags, + regex_program const& prog, bool const beginning_only, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) @@ -71,7 +71,7 @@ std::unique_ptr contains_impl(strings_column_view const& input, mr); if (input.is_empty()) { return results; } - auto d_prog = reprog_device::create(pattern, flags, capture_groups::NON_CAPTURE, stream); + auto d_prog = prog.get_impl()->create_prog_device(stream); auto d_results = results->mutable_view().data(); auto const d_strings = column_device_view::create(input.parent(), stream); @@ -88,33 +88,30 @@ std::unique_ptr contains_impl(strings_column_view const& input, std::unique_ptr contains_re( strings_column_view const& input, - std::string_view pattern, - regex_flags const flags, + regex_program const& prog, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { - return contains_impl(input, pattern, flags, false, stream, mr); + return contains_impl(input, prog, false, stream, mr); } std::unique_ptr matches_re( strings_column_view const& input, - std::string_view pattern, - regex_flags const flags, + regex_program const& prog, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { - return contains_impl(input, pattern, flags, true, stream, mr); + return contains_impl(input, prog, true, stream, mr); } std::unique_ptr count_re( strings_column_view const& input, - std::string_view pattern, - regex_flags const flags, + regex_program const& prog, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { - // compile regex into device object - auto d_prog = reprog_device::create(pattern, flags, capture_groups::NON_CAPTURE, stream); + // create device object from regex_program + auto 
d_prog = prog.get_impl()->create_prog_device(stream); auto const d_strings = column_device_view::create(input.parent(), stream); @@ -136,7 +133,16 @@ std::unique_ptr contains_re(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::contains_re(strings, pattern, flags, cudf::default_stream_value, mr); + auto const h_prog = regex_program::create(pattern, flags, capture_groups::NON_CAPTURE); + return detail::contains_re(strings, *h_prog, cudf::default_stream_value, mr); +} + +std::unique_ptr contains_re(strings_column_view const& strings, + regex_program const& prog, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::contains_re(strings, prog, cudf::default_stream_value, mr); } std::unique_ptr matches_re(strings_column_view const& strings, @@ -145,7 +151,16 @@ std::unique_ptr matches_re(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::matches_re(strings, pattern, flags, cudf::default_stream_value, mr); + auto const h_prog = regex_program::create(pattern, flags, capture_groups::NON_CAPTURE); + return detail::matches_re(strings, *h_prog, cudf::default_stream_value, mr); +} + +std::unique_ptr matches_re(strings_column_view const& strings, + regex_program const& prog, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::matches_re(strings, prog, cudf::default_stream_value, mr); } std::unique_ptr count_re(strings_column_view const& strings, @@ -154,7 +169,16 @@ std::unique_ptr count_re(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::count_re(strings, pattern, flags, cudf::default_stream_value, mr); + auto const h_prog = regex_program::create(pattern, flags, capture_groups::NON_CAPTURE); + return detail::count_re(strings, *h_prog, cudf::default_stream_value, mr); +} + +std::unique_ptr count_re(strings_column_view const& strings, + 
regex_program const& prog, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::count_re(strings, prog, cudf::default_stream_value, mr); } } // namespace strings diff --git a/cpp/src/strings/extract/extract.cu b/cpp/src/strings/extract/extract.cu index 76d2f84b1a0..5494fc9e265 100644 --- a/cpp/src/strings/extract/extract.cu +++ b/cpp/src/strings/extract/extract.cu @@ -14,6 +14,7 @@ * limitations under the License. */ +#include #include #include @@ -86,13 +87,12 @@ struct extract_fn { // std::unique_ptr
extract(strings_column_view const& input, - std::string_view pattern, - regex_flags const flags, + regex_program const& prog, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - // compile regex into device object - auto d_prog = reprog_device::create(pattern, flags, capture_groups::EXTRACT, stream); + // create device object from regex_program + auto d_prog = prog.get_impl()->create_prog_device(stream); auto const groups = d_prog->group_counts(); CUDF_EXPECTS(groups > 0, "Group indicators not found in regex pattern"); @@ -136,7 +136,16 @@ std::unique_ptr
extract(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::extract(strings, pattern, flags, cudf::default_stream_value, mr); + auto const h_prog = regex_program::create(pattern, flags, capture_groups::EXTRACT); + return detail::extract(strings, *h_prog, cudf::default_stream_value, mr); +} + +std::unique_ptr
extract(strings_column_view const& strings, + regex_program const& prog, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::extract(strings, prog, cudf::default_stream_value, mr); } } // namespace strings diff --git a/cpp/src/strings/extract/extract_all.cu b/cpp/src/strings/extract/extract_all.cu index 76c2788c1be..cf94a41c2ea 100644 --- a/cpp/src/strings/extract/extract_all.cu +++ b/cpp/src/strings/extract/extract_all.cu @@ -15,6 +15,7 @@ */ #include +#include #include #include @@ -97,16 +98,16 @@ struct extract_fn { */ std::unique_ptr extract_all_record( strings_column_view const& input, - std::string_view pattern, - regex_flags const flags, + regex_program const& prog, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto const strings_count = input.size(); auto const d_strings = column_device_view::create(input.parent(), stream); - // Compile regex into device object. - auto d_prog = reprog_device::create(pattern, flags, capture_groups::EXTRACT, stream); + // create device object from regex_program + auto d_prog = prog.get_impl()->create_prog_device(stream); + // The extract pattern should always include groups. 
auto const groups = d_prog->group_counts(); CUDF_EXPECTS(groups > 0, "extract_all requires group indicators in the regex pattern."); @@ -171,7 +172,16 @@ std::unique_ptr extract_all_record(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::extract_all_record(strings, pattern, flags, cudf::default_stream_value, mr); + auto const h_prog = regex_program::create(pattern, flags, capture_groups::EXTRACT); + return detail::extract_all_record(strings, *h_prog, cudf::default_stream_value, mr); +} + +std::unique_ptr extract_all_record(strings_column_view const& strings, + regex_program const& prog, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::extract_all_record(strings, prog, cudf::default_stream_value, mr); } } // namespace strings diff --git a/cpp/src/strings/regex/regcomp.cpp b/cpp/src/strings/regex/regcomp.cpp index 5b86aedc409..0c0404f31ce 100644 --- a/cpp/src/strings/regex/regcomp.cpp +++ b/cpp/src/strings/regex/regcomp.cpp @@ -123,7 +123,7 @@ int32_t reprog::add_class(reclass const& cls) reinst& reprog::inst_at(int32_t id) { return _insts[id]; } -reclass& reprog::class_at(int32_t id) { return _classes[id]; } +reclass const& reprog::class_at(int32_t id) const { return _classes[id]; } void reprog::set_start_inst(int32_t id) { _startinst_id = id; } diff --git a/cpp/src/strings/regex/regcomp.h b/cpp/src/strings/regex/regcomp.h index 7ad7f481436..b450b3f90e7 100644 --- a/cpp/src/strings/regex/regcomp.h +++ b/cpp/src/strings/regex/regcomp.h @@ -128,7 +128,7 @@ class reprog { [[nodiscard]] reinst const* insts_data() const; [[nodiscard]] int32_t classes_count() const; - [[nodiscard]] reclass& class_at(int32_t id); + [[nodiscard]] reclass const& class_at(int32_t id) const; [[nodiscard]] reclass const* classes_data() const; [[nodiscard]] const int32_t* starts_data() const; diff --git a/cpp/src/strings/regex/regex.cuh b/cpp/src/strings/regex/regex.cuh index 98631680800..d16efb5f66e 
100644 --- a/cpp/src/strings/regex/regex.cuh +++ b/cpp/src/strings/regex/regex.cuh @@ -25,6 +25,8 @@ #include #include +#include + #include namespace cudf { @@ -56,6 +58,8 @@ struct alignas(16) reclass_device { __device__ inline bool is_match(char32_t const ch, uint8_t const* flags) const; }; +class reprog; + /** * @brief Regex program of instructions/data for a specific regex pattern. * @@ -78,32 +82,14 @@ class reprog_device { reprog_device& operator=(reprog_device&&) = default; /** - * @brief Create device program instance from a regex pattern. - * - * The number of strings is needed to compute the state data size required when evaluating the - * regex. - * - * @param pattern The regex pattern to compile. - * @param stream CUDA stream used for device memory operations and kernel launches. - * @return The program device object. - */ - static std::unique_ptr> create( - std::string_view pattern, rmm::cuda_stream_view stream); - - /** - * @brief Create the device program instance from a regex pattern + * @brief Create device program instance from a regex program * - * @param pattern The regex pattern to compile - * @param re_flags Regex flags for interpreting special characters in the pattern - * @param capture Control how capture groups are processed + * @param prog The regex program to create from * @param stream CUDA stream used for device memory operations and kernel launches * @return The program device object */ static std::unique_ptr> create( - std::string_view pattern, - regex_flags const re_flags, - capture_groups const capture, - rmm::cuda_stream_view stream); + reprog const& prog, rmm::cuda_stream_view stream); /** * @brief Called automatically by the unique_ptr returned from create(). 
@@ -270,7 +256,7 @@ class reprog_device { cudf::size_type& end, cudf::size_type const group_id = 0) const; - reprog_device(reprog&); + reprog_device(reprog const&); int32_t _startinst_id; // first instruction id int32_t _num_capturing_groups; // instruction groups @@ -289,6 +275,16 @@ class reprog_device { int32_t _thread_count{}; // threads available in working memory }; +/** + * @brief Return the size in bytes needed for working memory to + * execute insts_count instructions in parallel over num_threads threads. + * + * @param num_threads Number of parallel threads (usually one per string in a strings column) + * @param insts_count Number of instructions from a compiled regex pattern + * @return Number of bytes needed for working memory + */ +std::size_t compute_working_memory_size(int32_t num_threads, int32_t insts_count); + } // namespace detail } // namespace strings } // namespace cudf diff --git a/cpp/src/strings/regex/regex_program.cpp b/cpp/src/strings/regex/regex_program.cpp new file mode 100644 index 00000000000..0372f42c567 --- /dev/null +++ b/cpp/src/strings/regex/regex_program.cpp @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "regex_program_impl.h" + +#include + +#include +#include + +namespace cudf { +namespace strings { + +std::unique_ptr regex_program::create(std::string_view pattern, + regex_flags flags, + capture_groups capture) +{ + auto p = new regex_program(pattern, flags, capture); + return std::unique_ptr(p); +} + +regex_program::regex_program() = default; +regex_program::regex_program(regex_program&& other) = default; +regex_program& regex_program::operator=(regex_program&& other) = default; + +regex_program::regex_program(std::string_view pattern, regex_flags flags, capture_groups capture) + : _pattern(pattern), _flags(flags) +{ + auto p = new regex_program_impl{detail::reprog::create_from(pattern, flags, capture)}; + _impl = std::unique_ptr(p); +} + +std::string regex_program::pattern() const { return _pattern; } + +regex_flags regex_program::flags() const { return _flags; } + +capture_groups regex_program::capture() const { return _capture; } + +int32_t regex_program::instructions_count() const { return _impl->prog.insts_count(); } + +int32_t regex_program::groups_count() const { return _impl->prog.groups_count(); } + +std::size_t regex_program::compute_working_memory_size(int32_t num_threads) const +{ + return detail::compute_working_memory_size(instructions_count(), num_threads); +} + +regex_program::regex_program_impl* regex_program::get_impl() const { return _impl.get(); } + +} // namespace strings +} // namespace cudf diff --git a/cpp/src/strings/regex/regex_program_impl.h b/cpp/src/strings/regex/regex_program_impl.h new file mode 100644 index 00000000000..46823c131b2 --- /dev/null +++ b/cpp/src/strings/regex/regex_program_impl.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "regcomp.h" +#include "regex.cuh" + +#include + +#include + +namespace cudf { +namespace strings { + +struct regex_program::regex_program_impl { + detail::reprog prog; + + auto create_prog_device(rmm::cuda_stream_view stream) + { + return detail::reprog_device::create(prog, stream); + } +}; + +} // namespace strings +} // namespace cudf diff --git a/cpp/src/strings/regex/regexec.cu b/cpp/src/strings/regex/regexec.cpp similarity index 90% rename from cpp/src/strings/regex/regexec.cu rename to cpp/src/strings/regex/regexec.cpp index 03247d24ba3..1c0a6869a2c 100644 --- a/cpp/src/strings/regex/regexec.cu +++ b/cpp/src/strings/regex/regexec.cpp @@ -33,7 +33,7 @@ namespace strings { namespace detail { // Copy reprog primitive values -reprog_device::reprog_device(reprog& prog) +reprog_device::reprog_device(reprog const& prog) : _startinst_id{prog.get_start_inst()}, _num_capturing_groups{prog.groups_count()}, _insts_count{prog.insts_count()}, @@ -45,22 +45,8 @@ reprog_device::reprog_device(reprog& prog) } std::unique_ptr> reprog_device::create( - std::string_view pattern, rmm::cuda_stream_view stream) + reprog const& h_prog, rmm::cuda_stream_view stream) { - return reprog_device::create( - pattern, regex_flags::MULTILINE, capture_groups::NON_CAPTURE, stream); -} - -// Create instance of the reprog that can be passed into a device kernel -std::unique_ptr> reprog_device::create( - std::string_view pattern, - regex_flags const flags, - capture_groups const capture, - rmm::cuda_stream_view stream) -{ - // compile pattern into host object - 
reprog h_prog = reprog::create_from(pattern, flags, capture); - // compute size to hold all the member data auto const insts_count = h_prog.insts_count(); auto const classes_count = h_prog.classes_count(); @@ -144,7 +130,7 @@ void reprog_device::destroy() { delete this; } std::size_t reprog_device::working_memory_size(int32_t num_threads) const { - return relist::alloc_size(_insts_count, num_threads) * 2; + return compute_working_memory_size(insts_counts(), num_threads); } std::pair reprog_device::compute_strided_working_memory( @@ -176,6 +162,11 @@ int32_t reprog_device::compute_shared_memory_size() const return _prog_size < MAX_SHARED_MEM ? static_cast(_prog_size) : 0; } +std::size_t compute_working_memory_size(int32_t num_threads, int32_t insts_count) +{ + return relist::alloc_size(insts_count, num_threads) * 2; +} + } // namespace detail } // namespace strings } // namespace cudf diff --git a/cpp/src/strings/replace/backref_re.cu b/cpp/src/strings/replace/backref_re.cu index e0a995c26b9..8c38bd2a7a2 100644 --- a/cpp/src/strings/replace/backref_re.cu +++ b/cpp/src/strings/replace/backref_re.cu @@ -16,6 +16,7 @@ #include "backref_re.cuh" +#include #include #include @@ -102,19 +103,18 @@ std::pair> parse_backrefs(std::string_vie // std::unique_ptr replace_with_backrefs(strings_column_view const& input, - std::string_view pattern, + regex_program const& prog, std::string_view replacement, - regex_flags const flags, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { if (input.is_empty()) return make_empty_column(type_id::STRING); - CUDF_EXPECTS(!pattern.empty(), "Parameter pattern must not be empty"); + CUDF_EXPECTS(!prog.pattern().empty(), "Parameter pattern must not be empty"); CUDF_EXPECTS(!replacement.empty(), "Parameter replacement must not be empty"); - // compile regex into device object - auto d_prog = reprog_device::create(pattern, flags, capture_groups::EXTRACT, stream); + // create device object from regex_program + auto d_prog = 
prog.get_impl()->create_prog_device(stream); // parse the repl string for back-ref indicators auto group_count = std::min(99, d_prog->group_counts()); // group count should NOT exceed 99 @@ -152,8 +152,18 @@ std::unique_ptr replace_with_backrefs(strings_column_view const& strings rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); + auto const h_prog = regex_program::create(pattern, flags, capture_groups::EXTRACT); return detail::replace_with_backrefs( - strings, pattern, replacement, flags, cudf::default_stream_value, mr); + strings, *h_prog, replacement, cudf::default_stream_value, mr); +} + +std::unique_ptr replace_with_backrefs(strings_column_view const& strings, + regex_program const& prog, + std::string_view replacement, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::replace_with_backrefs(strings, prog, replacement, cudf::default_stream_value, mr); } } // namespace strings diff --git a/cpp/src/strings/replace/multi_re.cu b/cpp/src/strings/replace/multi_re.cu index a5b9ad37e65..dd9bf2c5238 100644 --- a/cpp/src/strings/replace/multi_re.cu +++ b/cpp/src/strings/replace/multi_re.cu @@ -15,6 +15,7 @@ */ #include +#include #include #include @@ -145,7 +146,8 @@ std::unique_ptr replace_re( patterns.size()); std::transform( patterns.begin(), patterns.end(), h_progs.begin(), [flags, stream](auto const& ptn) { - return reprog_device::create(ptn, flags, capture_groups::NON_CAPTURE, stream); + auto h_prog = regex_program::create(ptn, flags, capture_groups::NON_CAPTURE); + return h_prog->get_impl()->create_prog_device(stream); }); // get the longest regex for the dispatcher diff --git a/cpp/src/strings/replace/replace_re.cu b/cpp/src/strings/replace/replace_re.cu index fd0049d7c89..80a4759f59a 100644 --- a/cpp/src/strings/replace/replace_re.cu +++ b/cpp/src/strings/replace/replace_re.cu @@ -14,6 +14,7 @@ * limitations under the License. 
*/ +#include #include #include @@ -102,10 +103,9 @@ struct replace_regex_fn { // std::unique_ptr replace_re( strings_column_view const& input, - std::string_view pattern, + regex_program const& prog, string_scalar const& replacement, std::optional max_replace_count, - regex_flags const flags, rmm::cuda_stream_view stream = cudf::default_stream_value, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { @@ -114,8 +114,8 @@ std::unique_ptr replace_re( CUDF_EXPECTS(replacement.is_valid(stream), "Parameter replacement must be valid"); string_view d_repl(replacement.data(), replacement.size()); - // compile regex into device object - auto d_prog = reprog_device::create(pattern, flags, capture_groups::NON_CAPTURE, stream); + // create device object from regex_program + auto d_prog = prog.get_impl()->create_prog_device(stream); auto const maxrepl = max_replace_count.value_or(-1); @@ -143,8 +143,20 @@ std::unique_ptr replace_re(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); + auto const h_prog = regex_program::create(pattern, flags, capture_groups::NON_CAPTURE); return detail::replace_re( - strings, pattern, replacement, max_replace_count, flags, cudf::default_stream_value, mr); + strings, *h_prog, replacement, max_replace_count, cudf::default_stream_value, mr); +} + +std::unique_ptr replace_re(strings_column_view const& strings, + regex_program const& prog, + string_scalar const& replacement, + std::optional max_replace_count, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::replace_re( + strings, prog, replacement, max_replace_count, cudf::default_stream_value, mr); } } // namespace strings diff --git a/cpp/src/strings/search/findall.cu b/cpp/src/strings/search/findall.cu index 73470bde867..b655d711df8 100644 --- a/cpp/src/strings/search/findall.cu +++ b/cpp/src/strings/search/findall.cu @@ -15,6 +15,7 @@ */ #include +#include #include #include @@ -94,16 +95,15 @@ 
std::unique_ptr findall_util(column_device_view const& d_strings, // std::unique_ptr findall( strings_column_view const& input, - std::string_view pattern, - regex_flags const flags, + regex_program const& prog, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto const strings_count = input.size(); auto const d_strings = column_device_view::create(input.parent(), stream); - // compile regex into device object - auto const d_prog = reprog_device::create(pattern, flags, capture_groups::NON_CAPTURE, stream); + // create device object from regex_program + auto d_prog = prog.get_impl()->create_prog_device(stream); // Create lists offsets column auto offsets = count_matches(*d_strings, *d_prog, strings_count + 1, stream, mr); @@ -139,7 +139,16 @@ std::unique_ptr findall(strings_column_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::findall(input, pattern, flags, cudf::default_stream_value, mr); + auto const h_prog = regex_program::create(pattern, flags, capture_groups::NON_CAPTURE); + return detail::findall(input, *h_prog, cudf::default_stream_value, mr); +} + +std::unique_ptr findall(strings_column_view const& input, + regex_program const& prog, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::findall(input, prog, cudf::default_stream_value, mr); } } // namespace strings diff --git a/cpp/src/strings/split/split_re.cu b/cpp/src/strings/split/split_re.cu index e8de1da0d83..2e286e70e7c 100644 --- a/cpp/src/strings/split/split_re.cu +++ b/cpp/src/strings/split/split_re.cu @@ -15,6 +15,7 @@ */ #include +#include #include #include @@ -184,13 +185,13 @@ struct tokens_transform_fn { }; std::unique_ptr
split_re(strings_column_view const& input, - std::string_view pattern, + regex_program const& prog, split_direction direction, size_type maxsplit, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - CUDF_EXPECTS(!pattern.empty(), "Parameter pattern must not be empty"); + CUDF_EXPECTS(!prog.pattern().empty(), "Parameter pattern must not be empty"); auto const strings_count = input.size(); @@ -200,8 +201,9 @@ std::unique_ptr
split_re(strings_column_view const& input, return std::make_unique
(std::move(results)); } - // create the regex device prog from the given pattern - auto d_prog = reprog_device::create(pattern, stream); + // create device object from regex_program + auto d_prog = prog.get_impl()->create_prog_device(stream); + auto d_strings = column_device_view::create(input.parent(), stream); // count the number of delimiters matched in each string @@ -252,18 +254,19 @@ std::unique_ptr
split_re(strings_column_view const& input, } std::unique_ptr split_record_re(strings_column_view const& input, - std::string_view pattern, + regex_program const& prog, split_direction direction, size_type maxsplit, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - CUDF_EXPECTS(!pattern.empty(), "Parameter pattern must not be empty"); + CUDF_EXPECTS(!prog.pattern().empty(), "Parameter pattern must not be empty"); auto const strings_count = input.size(); - // create the regex device prog from the given pattern - auto d_prog = reprog_device::create(pattern, stream); + // create device object from regex_program + auto d_prog = prog.get_impl()->create_prog_device(stream); + auto d_strings = column_device_view::create(input.parent(), stream); // count the number of delimiters matched in each string @@ -289,39 +292,39 @@ std::unique_ptr split_record_re(strings_column_view const& input, } // namespace std::unique_ptr
split_re(strings_column_view const& input, - std::string_view pattern, + regex_program const& prog, size_type maxsplit, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - return split_re(input, pattern, split_direction::FORWARD, maxsplit, stream, mr); + return split_re(input, prog, split_direction::FORWARD, maxsplit, stream, mr); } std::unique_ptr split_record_re(strings_column_view const& input, - std::string_view pattern, + regex_program const& prog, size_type maxsplit, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - return split_record_re(input, pattern, split_direction::FORWARD, maxsplit, stream, mr); + return split_record_re(input, prog, split_direction::FORWARD, maxsplit, stream, mr); } std::unique_ptr
rsplit_re(strings_column_view const& input, - std::string_view pattern, + regex_program const& prog, size_type maxsplit, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - return split_re(input, pattern, split_direction::BACKWARD, maxsplit, stream, mr); + return split_re(input, prog, split_direction::BACKWARD, maxsplit, stream, mr); } std::unique_ptr rsplit_record_re(strings_column_view const& input, - std::string_view pattern, + regex_program const& prog, size_type maxsplit, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - return split_record_re(input, pattern, split_direction::BACKWARD, maxsplit, stream, mr); + return split_record_re(input, prog, split_direction::BACKWARD, maxsplit, stream, mr); } } // namespace detail @@ -334,7 +337,17 @@ std::unique_ptr
split_re(strings_column_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::split_re(input, pattern, maxsplit, cudf::default_stream_value, mr); + auto const h_prog = regex_program::create(pattern); + return detail::split_re(input, *h_prog, maxsplit, cudf::default_stream_value, mr); +} + +std::unique_ptr
split_re(strings_column_view const& input, + regex_program const& prog, + size_type maxsplit, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::split_re(input, prog, maxsplit, cudf::default_stream_value, mr); } std::unique_ptr split_record_re(strings_column_view const& input, @@ -343,7 +356,17 @@ std::unique_ptr split_record_re(strings_column_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::split_record_re(input, pattern, maxsplit, cudf::default_stream_value, mr); + auto const h_prog = regex_program::create(pattern); + return detail::split_record_re(input, *h_prog, maxsplit, cudf::default_stream_value, mr); +} + +std::unique_ptr split_record_re(strings_column_view const& input, + regex_program const& prog, + size_type maxsplit, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::split_record_re(input, prog, maxsplit, cudf::default_stream_value, mr); } std::unique_ptr
rsplit_re(strings_column_view const& input, @@ -352,7 +375,17 @@ std::unique_ptr
rsplit_re(strings_column_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::rsplit_re(input, pattern, maxsplit, cudf::default_stream_value, mr); + auto const h_prog = regex_program::create(pattern); + return detail::rsplit_re(input, *h_prog, maxsplit, cudf::default_stream_value, mr); +} + +std::unique_ptr
rsplit_re(strings_column_view const& input, + regex_program const& prog, + size_type maxsplit, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::rsplit_re(input, prog, maxsplit, cudf::default_stream_value, mr); } std::unique_ptr rsplit_record_re(strings_column_view const& input, @@ -361,7 +394,18 @@ std::unique_ptr rsplit_record_re(strings_column_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::rsplit_record_re(input, pattern, maxsplit, cudf::default_stream_value, mr); + auto const h_prog = regex_program::create(pattern); + return detail::rsplit_record_re(input, *h_prog, maxsplit, cudf::default_stream_value, mr); } + +std::unique_ptr rsplit_record_re(strings_column_view const& input, + regex_program const& prog, + size_type maxsplit, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::rsplit_record_re(input, prog, maxsplit, cudf::default_stream_value, mr); +} + } // namespace strings } // namespace cudf From f6c8b6b9393fe0bed60a36ec9c317d7fa88400aa Mon Sep 17 00:00:00 2001 From: David Wendt Date: Fri, 14 Oct 2022 17:21:58 -0400 Subject: [PATCH 02/11] fix missing doxygen --- conda/recipes/libcudf/meta.yaml | 1 + cpp/include/cudf/strings/regex/regex_program.hpp | 13 +++++++++++++ 2 files changed, 14 insertions(+) diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index ccb0d685062..8955fed47fb 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -233,6 +233,7 @@ outputs: - test -f $PREFIX/include/cudf/strings/json.hpp - test -f $PREFIX/include/cudf/strings/padding.hpp - test -f $PREFIX/include/cudf/strings/regex/flags.hpp + - test -f $PREFIX/include/cudf/strings/regex/regex_program.hpp - test -f $PREFIX/include/cudf/strings/repeat_strings.hpp - test -f $PREFIX/include/cudf/strings/replace.hpp - test -f $PREFIX/include/cudf/strings/replace_re.hpp diff --git a/cpp/include/cudf/strings/regex/regex_program.hpp 
b/cpp/include/cudf/strings/regex/regex_program.hpp index cfe1f07e062..a75ab724c9c 100644 --- a/cpp/include/cudf/strings/regex/regex_program.hpp +++ b/cpp/include/cudf/strings/regex/regex_program.hpp @@ -52,7 +52,19 @@ struct regex_program { regex_flags flags = regex_flags::DEFAULT, capture_groups capture = capture_groups::EXTRACT); + /** + * @brief Move constructor + * + * @param other Object to move from + */ regex_program(regex_program&& other); + + /** + * @brief Move operator assignment + * + * @param other Object to move from + * @return this object + */ regex_program& operator=(regex_program&& other); /** @@ -100,6 +112,7 @@ struct regex_program { /** * @brief Return the pattern used to create this instance * + * @param num_threads Number of parallel threads for computation * @return regex pattern as a string */ std::size_t compute_working_memory_size(int32_t num_threads) const; From 87783ac9a8c3bcb44a33022324f4a7b040515d0a Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 17 Oct 2022 15:56:47 -0400 Subject: [PATCH 03/11] add regex_program dtor decl/def --- cpp/include/cudf/strings/regex/regex_program.hpp | 10 +++++++--- cpp/src/strings/regex/regex_program.cpp | 1 + cpp/src/strings/regex/regex_program_impl.h | 10 ++++++++++ 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/cpp/include/cudf/strings/regex/regex_program.hpp b/cpp/include/cudf/strings/regex/regex_program.hpp index a75ab724c9c..e0c93543cc1 100644 --- a/cpp/include/cudf/strings/regex/regex_program.hpp +++ b/cpp/include/cudf/strings/regex/regex_program.hpp @@ -30,9 +30,9 @@ namespace strings { */ /** - * @brief Regex program class. + * @brief Regex program class * - * Create an instance from a regex pattern and use it to call + * Create an instance from a regex pattern and use it to call the appropriate * strings APIs. An instance can be reused. * * See the @ref md_regex "Regex Features" page for details on patterns and APIs the support regex. 
@@ -43,9 +43,11 @@ struct regex_program { /** * @brief Create a program from a pattern * + * @throw cudf::logic_error If pattern is found to be invalid or contain unsupported features + * * @param pattern Regex pattern * @param flags Regex flags for interpreting special characters in the pattern - * @param capture Control how capture groups in the pattern are used + * @param capture Controls how capture groups in the pattern are used * @return Instance of this object */ static std::unique_ptr create(std::string_view pattern, @@ -117,6 +119,8 @@ struct regex_program { */ std::size_t compute_working_memory_size(int32_t num_threads) const; + ~regex_program(); + private: regex_program(); diff --git a/cpp/src/strings/regex/regex_program.cpp b/cpp/src/strings/regex/regex_program.cpp index 0372f42c567..493ee6a9f69 100644 --- a/cpp/src/strings/regex/regex_program.cpp +++ b/cpp/src/strings/regex/regex_program.cpp @@ -33,6 +33,7 @@ std::unique_ptr regex_program::create(std::string_view pattern, } regex_program::regex_program() = default; +regex_program::~regex_program() = default; regex_program::regex_program(regex_program&& other) = default; regex_program& regex_program::operator=(regex_program&& other) = default; diff --git a/cpp/src/strings/regex/regex_program_impl.h b/cpp/src/strings/regex/regex_program_impl.h index 46823c131b2..5c20d6e982f 100644 --- a/cpp/src/strings/regex/regex_program_impl.h +++ b/cpp/src/strings/regex/regex_program_impl.h @@ -24,9 +24,19 @@ namespace cudf { namespace strings { +/** + * @brief Implementation object for regex_program + * + * It encapsulates internal reprog object used for building its device equivalent + */ struct regex_program::regex_program_impl { detail::reprog prog; + /** + * @brief Return device instance of reprog object + * + * @param stream CUDA stream to use for device memory allocations and copies + */ auto create_prog_device(rmm::cuda_stream_view stream) { return detail::reprog_device::create(prog, stream); From 
1ebfc3bdda4461e85929d9e461b865fe7e883b9f Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 17 Oct 2022 17:44:02 -0400 Subject: [PATCH 04/11] fix doxygen for compute-working-memory-size function --- cpp/include/cudf/strings/regex/regex_program.hpp | 6 +++--- cpp/src/strings/regex/regex_program.cpp | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/include/cudf/strings/regex/regex_program.hpp b/cpp/include/cudf/strings/regex/regex_program.hpp index e0c93543cc1..da3ec87ffc4 100644 --- a/cpp/include/cudf/strings/regex/regex_program.hpp +++ b/cpp/include/cudf/strings/regex/regex_program.hpp @@ -114,10 +114,10 @@ struct regex_program { /** * @brief Return the pattern used to create this instance * - * @param num_threads Number of parallel threads for computation - * @return regex pattern as a string + * @param num_strings Number of strings for computation + * @return Size of the working memory in bytes */ - std::size_t compute_working_memory_size(int32_t num_threads) const; + std::size_t compute_working_memory_size(int32_t num_strings) const; ~regex_program(); diff --git a/cpp/src/strings/regex/regex_program.cpp b/cpp/src/strings/regex/regex_program.cpp index 493ee6a9f69..4a382caaa5a 100644 --- a/cpp/src/strings/regex/regex_program.cpp +++ b/cpp/src/strings/regex/regex_program.cpp @@ -54,9 +54,9 @@ int32_t regex_program::instructions_count() const { return _impl->prog.insts_cou int32_t regex_program::groups_count() const { return _impl->prog.groups_count(); } -std::size_t regex_program::compute_working_memory_size(int32_t num_threads) const +std::size_t regex_program::compute_working_memory_size(int32_t num_strings) const { - return detail::compute_working_memory_size(instructions_count(), num_threads); + return detail::compute_working_memory_size(instructions_count(), num_strings); } regex_program::regex_program_impl* regex_program::get_impl() const { return _impl.get(); } From 52d86fd4b002c32cc523994339f6f39d640fa3d9 Mon Sep 17 00:00:00 2001 
From: David Wendt Date: Wed, 19 Oct 2022 17:13:26 -0400 Subject: [PATCH 05/11] add gtests with regex_program --- cpp/tests/strings/contains_tests.cpp | 375 ++++++++++++++++------ cpp/tests/strings/extract_tests.cpp | 66 +++- cpp/tests/strings/findall_tests.cpp | 20 +- cpp/tests/strings/replace_regex_tests.cpp | 240 +++++++++----- 4 files changed, 500 insertions(+), 201 deletions(-) diff --git a/cpp/tests/strings/contains_tests.cpp b/cpp/tests/strings/contains_tests.cpp index ba738f7b616..1b6f7ba3618 100644 --- a/cpp/tests/strings/contains_tests.cpp +++ b/cpp/tests/strings/contains_tests.cpp @@ -14,13 +14,15 @@ * limitations under the License. */ -#include -#include -#include - #include #include #include +#include + +#include +#include +#include +#include #include #include @@ -147,6 +149,9 @@ TEST_F(StringsContainsTests, ContainsTest) h_expected + h_strings.size(), thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + auto prog = cudf::strings::regex_program::create(ptn); + results = cudf::strings::contains_re(strings_view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } } @@ -161,40 +166,56 @@ TEST_F(StringsContainsTests, MatchesTest) auto strings_view = cudf::strings_column_view(strings); { - auto results = cudf::strings::matches_re(strings_view, "lazy"); - bool h_expected[] = {false, false, true, false, false, false, false}; + auto const pattern = std::string("lazy"); + auto results = cudf::strings::matches_re(strings_view, pattern); + bool h_expected[] = {false, false, true, false, false, false, false}; cudf::test::fixed_width_column_wrapper expected( h_expected, h_expected + h_strings.size(), thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + auto prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::matches_re(strings_view, 
*prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { - auto results = cudf::strings::matches_re(strings_view, "\\d+"); - bool h_expected[] = {false, false, false, true, true, false, false}; + auto const pattern = std::string("\\d+"); + auto results = cudf::strings::matches_re(strings_view, pattern); + bool h_expected[] = {false, false, false, true, true, false, false}; cudf::test::fixed_width_column_wrapper expected( h_expected, h_expected + h_strings.size(), thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + auto prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::matches_re(strings_view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { - auto results = cudf::strings::matches_re(strings_view, "@\\w+"); - bool h_expected[] = {false, false, false, false, false, false, false}; + auto const pattern = std::string("@\\w+"); + auto results = cudf::strings::matches_re(strings_view, pattern); + bool h_expected[] = {false, false, false, false, false, false, false}; cudf::test::fixed_width_column_wrapper expected( h_expected, h_expected + h_strings.size(), thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + auto prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::matches_re(strings_view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { - auto results = cudf::strings::matches_re(strings_view, ".*"); - bool h_expected[] = {true, true, true, true, true, false, true}; + auto const pattern = std::string(".*"); + auto results = cudf::strings::matches_re(strings_view, pattern); + bool h_expected[] = {true, true, true, true, true, false, true}; cudf::test::fixed_width_column_wrapper expected( h_expected, h_expected + h_strings.size(), thrust::make_transform_iterator(h_strings.begin(), 
[](auto str) { return str != nullptr; })); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + auto prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::matches_re(strings_view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } } @@ -219,6 +240,9 @@ TEST_F(StringsContainsTests, MatchesIPV4Test) cudf::test::fixed_width_column_wrapper expected( {true, true, false, false, false, false, true, true, true, true}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + auto prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::matches_re(strings_view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); } { // is_loopback: 72 instructions std::string pattern = @@ -229,6 +253,9 @@ TEST_F(StringsContainsTests, MatchesIPV4Test) cudf::test::fixed_width_column_wrapper expected( {false, false, false, false, false, false, false, false, false, true}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + auto prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::matches_re(strings_view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); } { // is_multicast: 79 instructions std::string pattern = @@ -239,6 +266,9 @@ TEST_F(StringsContainsTests, MatchesIPV4Test) cudf::test::fixed_width_column_wrapper expected( {false, false, false, false, false, false, true, true, false, false}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + auto prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::matches_re(strings_view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); } } @@ -247,18 +277,43 @@ TEST_F(StringsContainsTests, OctalTest) cudf::test::strings_column_wrapper strings({"A3", "B", "CDA3EY", "", "99", "\a\t\r"}); auto strings_view = cudf::strings_column_view(strings); auto expected = cudf::test::fixed_width_column_wrapper({1, 0, 1, 0, 0, 0}); - auto results = 
cudf::strings::contains_re(strings_view, "\\101"); + + auto pattern = std::string("\\101"); + auto results = cudf::strings::contains_re(strings_view, pattern); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + auto prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::contains_re(strings_view, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - results = cudf::strings::contains_re(strings_view, "\\1013"); + + pattern = std::string("\\1013"); + results = cudf::strings::contains_re(strings_view, pattern); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::contains_re(strings_view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + pattern = std::string("D*\\101\\063"); + results = cudf::strings::contains_re(strings_view, pattern); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - results = cudf::strings::contains_re(strings_view, "D*\\101\\063"); + prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::contains_re(strings_view, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - results = cudf::strings::contains_re(strings_view, "\\719"); + + pattern = std::string("\\719"); + results = cudf::strings::contains_re(strings_view, pattern); expected = cudf::test::fixed_width_column_wrapper({0, 0, 0, 0, 1, 0}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - results = cudf::strings::contains_re(strings_view, "[\\7][\\11][\\15]"); + prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::contains_re(strings_view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + pattern = std::string("[\\7][\\11][\\15]"); + results = cudf::strings::contains_re(strings_view, pattern); expected = cudf::test::fixed_width_column_wrapper({0, 0, 0, 0, 0, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + prog = cudf::strings::regex_program::create(pattern); + 
results = cudf::strings::contains_re(strings_view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } TEST_F(StringsContainsTests, HexTest) @@ -285,10 +340,17 @@ TEST_F(StringsContainsTests, HexTest) 0, [ch](auto idx) { return ch == static_cast(idx); }); cudf::test::fixed_width_column_wrapper expected(true_dat, true_dat + count); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + auto prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::contains_re(strings_view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + // also test hex character appearing in character class brackets pattern = "[" + pattern + "]"; results = cudf::strings::contains_re(strings_view, pattern); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::contains_re(strings_view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } } @@ -303,36 +365,56 @@ TEST_F(StringsContainsTests, EmbeddedNullCharacter) cudf::test::strings_column_wrapper input(data.begin(), data.end()); auto strings_view = cudf::strings_column_view(input); - auto results = cudf::strings::contains_re(strings_view, "A"); + auto pattern = std::string("A"); + auto results = cudf::strings::contains_re(strings_view, pattern); auto expected = cudf::test::fixed_width_column_wrapper({1, 0, 0, 0, 0, 0, 0, 0, 0, 0}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + auto prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::contains_re(strings_view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - results = cudf::strings::contains_re(strings_view, "B"); + pattern = std::string("B"); + results = cudf::strings::contains_re(strings_view, pattern); expected = cudf::test::fixed_width_column_wrapper({1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + 
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::contains_re(strings_view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - results = cudf::strings::contains_re(strings_view, "J\\0B"); + pattern = std::string("J\\0B"); + results = cudf::strings::contains_re(strings_view, pattern); expected = cudf::test::fixed_width_column_wrapper({0, 0, 0, 0, 0, 0, 0, 0, 0, 1}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::contains_re(strings_view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - results = cudf::strings::contains_re(strings_view, "[G-J][\\0]B"); + pattern = std::string("[G-J][\\0]B"); + results = cudf::strings::contains_re(strings_view, pattern); expected = cudf::test::fixed_width_column_wrapper({0, 0, 0, 0, 0, 0, 1, 1, 1, 1}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::contains_re(strings_view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - results = cudf::strings::contains_re(strings_view, "[A-D][\\x00]B"); + pattern = std::string("[A-D][\\x00]B"); + results = cudf::strings::contains_re(strings_view, pattern); expected = cudf::test::fixed_width_column_wrapper({1, 1, 1, 1, 0, 0, 0, 0, 0, 0}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::contains_re(strings_view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } TEST_F(StringsContainsTests, Errors) { - cudf::test::strings_column_wrapper input({"3", "33"}); - auto strings_view = cudf::strings_column_view(input); + 
EXPECT_THROW(cudf::strings::regex_program::create("(3?)+"), cudf::logic_error); + EXPECT_THROW(cudf::strings::regex_program::create("(?:3?)+"), cudf::logic_error); + EXPECT_THROW(cudf::strings::regex_program::create("3?+"), cudf::logic_error); + EXPECT_THROW(cudf::strings::regex_program::create("{3}a"), cudf::logic_error); - EXPECT_THROW(cudf::strings::contains_re(strings_view, "(3?)+"), cudf::logic_error); - EXPECT_THROW(cudf::strings::contains_re(strings_view, "(?:3?)+"), cudf::logic_error); - EXPECT_THROW(cudf::strings::contains_re(strings_view, "3?+"), cudf::logic_error); - EXPECT_THROW(cudf::strings::count_re(strings_view, "{3}a"), cudf::logic_error); + EXPECT_THROW(cudf::strings::regex_program::create("aaaa{1234,5678}"), cudf::logic_error); + EXPECT_THROW(cudf::strings::regex_program::create("aaaa{123,5678}"), cudf::logic_error); } TEST_F(StringsContainsTests, CountTest) @@ -340,36 +422,37 @@ TEST_F(StringsContainsTests, CountTest) std::vector h_strings{ "The quick brown @fox jumps ovér the", "lazy @dog", "1:2:3:4", "00:0:00", nullptr, ""}; cudf::test::strings_column_wrapper strings( - h_strings.begin(), - h_strings.end(), - thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); + h_strings.begin(), h_strings.end(), cudf::test::iterators::nulls_from_nullptrs(h_strings)); auto strings_view = cudf::strings_column_view(strings); { - auto results = cudf::strings::count_re(strings_view, "[tT]he"); - int32_t h_expected[] = {2, 0, 0, 0, 0, 0}; + auto pattern = std::string("[tT]he"); + auto results = cudf::strings::count_re(strings_view, pattern); cudf::test::fixed_width_column_wrapper expected( - h_expected, - h_expected + h_strings.size(), - thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); + {2, 0, 0, 0, 0, 0}, cudf::test::iterators::nulls_from_nullptrs(h_strings)); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + auto prog = cudf::strings::regex_program::create(pattern); + 
results = cudf::strings::count_re(strings_view, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { - auto results = cudf::strings::count_re(strings_view, "@\\w+"); - int32_t h_expected[] = {1, 1, 0, 0, 0, 0}; + auto pattern = std::string("@\\w+"); + auto results = cudf::strings::count_re(strings_view, pattern); cudf::test::fixed_width_column_wrapper expected( - h_expected, - h_expected + h_strings.size(), - thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); + {1, 1, 0, 0, 0, 0}, cudf::test::iterators::nulls_from_nullptrs(h_strings)); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + auto prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::count_re(strings_view, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { - auto results = cudf::strings::count_re(strings_view, "\\d+:\\d+"); - int32_t h_expected[] = {0, 0, 2, 1, 0, 0}; + auto pattern = std::string("\\d+:\\d+"); + auto results = cudf::strings::count_re(strings_view, pattern); cudf::test::fixed_width_column_wrapper expected( - h_expected, - h_expected + h_strings.size(), - thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); + {0, 0, 2, 1, 0, 0}, cudf::test::iterators::nulls_from_nullptrs(h_strings)); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + auto prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::count_re(strings_view, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } } @@ -381,67 +464,90 @@ TEST_F(StringsContainsTests, FixedQuantifier) { // exact match - auto results = cudf::strings::count_re(sv, "a{3}"); + auto pattern = std::string("a{3}"); + auto results = cudf::strings::count_re(sv, pattern); cudf::test::fixed_width_column_wrapper expected({0, 0, 1, 1, 1, 2}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + auto prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::count_re(sv, 
*prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { // range match (greedy quantifier) - auto results = cudf::strings::count_re(sv, "a{3,5}"); + auto pattern = std::string("a{3,5}"); + auto results = cudf::strings::count_re(sv, pattern); cudf::test::fixed_width_column_wrapper expected({0, 0, 1, 1, 1, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + auto prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::count_re(sv, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { // minimum match (greedy quantifier) - auto results = cudf::strings::count_re(sv, "a{2,}"); + auto pattern = std::string("a{2,}"); + auto results = cudf::strings::count_re(sv, pattern); cudf::test::fixed_width_column_wrapper expected({0, 1, 1, 1, 1, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + auto prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::count_re(sv, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { // range match (lazy quantifier) - auto results = cudf::strings::count_re(sv, "a{2,4}?"); + auto pattern = std::string("a{2,4}?"); + auto results = cudf::strings::count_re(sv, pattern); cudf::test::fixed_width_column_wrapper expected({0, 1, 1, 2, 2, 3}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + auto prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::count_re(sv, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { // minimum match (lazy quantifier) - auto results = cudf::strings::count_re(sv, "a{1,}?"); + auto pattern = std::string("a{1,}?"); + auto results = cudf::strings::count_re(sv, pattern); cudf::test::fixed_width_column_wrapper expected({1, 2, 3, 4, 5, 6}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + auto prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::count_re(sv, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { // zero match - auto results = 
cudf::strings::count_re(sv, "aaaa{0}"); + auto pattern = std::string("aaaa{0}"); + auto results = cudf::strings::count_re(sv, pattern); cudf::test::fixed_width_column_wrapper expected({0, 0, 1, 1, 1, 2}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + auto prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::count_re(sv, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { // poorly formed - auto results = cudf::strings::count_re(sv, "aaaa{n,m}"); + auto pattern = std::string("aaaa{n,m}"); + auto results = cudf::strings::count_re(sv, pattern); cudf::test::fixed_width_column_wrapper expected({0, 0, 0, 0, 0, 0}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - EXPECT_THROW(cudf::strings::count_re(sv, "aaaa{1234,5678}"), cudf::logic_error); - EXPECT_THROW(cudf::strings::count_re(sv, "aaaa{123,5678}"), cudf::logic_error); + auto prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::count_re(sv, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } } TEST_F(StringsContainsTests, QuantifierErrors) { - auto input = cudf::test::strings_column_wrapper({"a", "aa", "aaa", "aaaa", "aaaaa", "aaaaaa"}); - auto sv = cudf::strings_column_view(input); - - EXPECT_THROW(cudf::strings::contains_re(sv, "^+"), cudf::logic_error); - EXPECT_THROW(cudf::strings::count_re(sv, "$+"), cudf::logic_error); - EXPECT_THROW(cudf::strings::count_re(sv, "(^)+"), cudf::logic_error); - EXPECT_THROW(cudf::strings::contains_re(sv, "($)+"), cudf::logic_error); - EXPECT_THROW(cudf::strings::count_re(sv, "\\A+"), cudf::logic_error); - EXPECT_THROW(cudf::strings::count_re(sv, "\\Z+"), cudf::logic_error); - EXPECT_THROW(cudf::strings::contains_re(sv, "(\\A)+"), cudf::logic_error); - EXPECT_THROW(cudf::strings::contains_re(sv, "(\\Z)+"), cudf::logic_error); - - EXPECT_THROW(cudf::strings::contains_re(sv, "(^($))+"), cudf::logic_error); - EXPECT_NO_THROW(cudf::strings::contains_re(sv, "(^a($))+")); - 
EXPECT_NO_THROW(cudf::strings::count_re(sv, "(^(a$))+")); + EXPECT_THROW(cudf::strings::regex_program::create("^+"), cudf::logic_error); + EXPECT_THROW(cudf::strings::regex_program::create("$+"), cudf::logic_error); + EXPECT_THROW(cudf::strings::regex_program::create("(^)+"), cudf::logic_error); + EXPECT_THROW(cudf::strings::regex_program::create("($)+"), cudf::logic_error); + EXPECT_THROW(cudf::strings::regex_program::create("\\A+"), cudf::logic_error); + EXPECT_THROW(cudf::strings::regex_program::create("\\Z+"), cudf::logic_error); + EXPECT_THROW(cudf::strings::regex_program::create("(\\A)+"), cudf::logic_error); + EXPECT_THROW(cudf::strings::regex_program::create("(\\Z)+"), cudf::logic_error); + + EXPECT_THROW(cudf::strings::regex_program::create("(^($))+"), cudf::logic_error); + EXPECT_NO_THROW(cudf::strings::regex_program::create("(^a($))+")); + EXPECT_NO_THROW(cudf::strings::regex_program::create("(^(a$))+")); } TEST_F(StringsContainsTests, OverlappedClasses) @@ -450,14 +556,22 @@ TEST_F(StringsContainsTests, OverlappedClasses) auto sv = cudf::strings_column_view(input); { - auto results = cudf::strings::count_re(sv, "[e-gb-da-c]"); + auto pattern = std::string("[e-gb-da-c]"); + auto results = cudf::strings::count_re(sv, pattern); cudf::test::fixed_width_column_wrapper expected({7, 4, 0, 0, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + auto prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::count_re(sv, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { - auto results = cudf::strings::count_re(sv, "[á-éê-ú]"); + auto pattern = std::string("[á-éê-ú]"); + auto results = cudf::strings::count_re(sv, pattern); cudf::test::fixed_width_column_wrapper expected({0, 1, 0, 6, 0}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + auto prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::count_re(sv, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } } @@ -467,14 
+581,22 @@ TEST_F(StringsContainsTests, NegatedClasses) auto sv = cudf::strings_column_view(input); { - auto results = cudf::strings::count_re(sv, "[^a-f]"); + auto pattern = std::string("[^a-f]"); + auto results = cudf::strings::count_re(sv, pattern); cudf::test::fixed_width_column_wrapper expected({1, 4, 0, 5, 3}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + auto prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::count_re(sv, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { - auto results = cudf::strings::count_re(sv, "[^a-eá-é]"); + auto pattern = std::string("[^a-eá-é]"); + auto results = cudf::strings::count_re(sv, pattern); cudf::test::fixed_width_column_wrapper expected({2, 5, 0, 1, 3}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + auto prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::count_re(sv, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } } @@ -487,14 +609,18 @@ TEST_F(StringsContainsTests, IncompleteClassesRange) cudf::test::fixed_width_column_wrapper expected({1, 0, 0, 1, 1}); auto results = cudf::strings::contains_re(sv, "[a-z]"); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - results = cudf::strings::contains_re(sv, "[a-m-z]"); // same as [a-z] + + auto prog = cudf::strings::regex_program::create("[a-m-z]"); // same as [a-z] + results = cudf::strings::contains_re(sv, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { cudf::test::fixed_width_column_wrapper expected({1, 1, 0, 1, 1}); auto results = cudf::strings::contains_re(sv, "[g-]"); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - results = cudf::strings::contains_re(sv, "[-k]"); + + auto prog = cudf::strings::regex_program::create("[-k]"); + results = cudf::strings::contains_re(sv, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { @@ -503,9 +629,12 @@ TEST_F(StringsContainsTests, IncompleteClassesRange) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, 
expected); results = cudf::strings::contains_re(sv, "[+--]"); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - results = cudf::strings::contains_re(sv, "[a-c-]"); + + auto prog = cudf::strings::regex_program::create("[a-c-]"); + results = cudf::strings::contains_re(sv, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - results = cudf::strings::contains_re(sv, "[-d-f]"); + prog = cudf::strings::regex_program::create("[-d-f]"); + results = cudf::strings::contains_re(sv, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } } @@ -516,26 +645,43 @@ TEST_F(StringsContainsTests, MultiLine) cudf::test::strings_column_wrapper({"abc\nfff\nabc", "fff\nabc\nlll", "abc", "", "abc\n"}); auto view = cudf::strings_column_view(input); - auto results = cudf::strings::contains_re(view, "^abc$", cudf::strings::regex_flags::MULTILINE); + auto pattern = std::string("^abc$"); + auto prog = cudf::strings::regex_program::create(pattern); + auto prog_ml = + cudf::strings::regex_program::create(pattern, cudf::strings::regex_flags::MULTILINE); + + auto results = cudf::strings::contains_re(view, pattern, cudf::strings::regex_flags::MULTILINE); auto expected_contains = cudf::test::fixed_width_column_wrapper({1, 1, 1, 0, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_contains); - results = cudf::strings::contains_re(view, "^abc$"); + results = cudf::strings::contains_re(view, *prog_ml); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_contains); + results = cudf::strings::contains_re(view, pattern); expected_contains = cudf::test::fixed_width_column_wrapper({0, 0, 1, 0, 0}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_contains); + results = cudf::strings::contains_re(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_contains); - results = cudf::strings::matches_re(view, "^abc$", cudf::strings::regex_flags::MULTILINE); + results = cudf::strings::matches_re(view, pattern, cudf::strings::regex_flags::MULTILINE); auto 
expected_matches = cudf::test::fixed_width_column_wrapper({1, 0, 1, 0, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_matches); - results = cudf::strings::matches_re(view, "^abc$"); + results = cudf::strings::matches_re(view, *prog_ml); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_matches); + results = cudf::strings::matches_re(view, pattern); expected_matches = cudf::test::fixed_width_column_wrapper({0, 0, 1, 0, 0}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_matches); + results = cudf::strings::matches_re(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_matches); - results = cudf::strings::count_re(view, "^abc$", cudf::strings::regex_flags::MULTILINE); + results = cudf::strings::count_re(view, pattern, cudf::strings::regex_flags::MULTILINE); auto expected_count = cudf::test::fixed_width_column_wrapper({2, 1, 1, 0, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_count); - results = cudf::strings::count_re(view, "^abc$"); + results = cudf::strings::count_re(view, *prog_ml); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_count); + results = cudf::strings::count_re(view, pattern); expected_count = cudf::test::fixed_width_column_wrapper({0, 0, 1, 0, 0}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_count); + results = cudf::strings::count_re(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_count); } TEST_F(StringsContainsTests, DotAll) @@ -543,31 +689,55 @@ TEST_F(StringsContainsTests, DotAll) auto input = cudf::test::strings_column_wrapper({"abc\nfa\nef", "fff\nabbc\nfff", "abcdef", ""}); auto view = cudf::strings_column_view(input); - auto results = cudf::strings::contains_re(view, "a.*f", cudf::strings::regex_flags::DOTALL); + auto pattern = std::string("a.*f"); + auto prog = cudf::strings::regex_program::create(pattern); + auto prog_dotall = + cudf::strings::regex_program::create(pattern, cudf::strings::regex_flags::DOTALL); + + auto results = 
cudf::strings::contains_re(view, pattern, cudf::strings::regex_flags::DOTALL); auto expected_contains = cudf::test::fixed_width_column_wrapper({1, 1, 1, 0}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_contains); - results = cudf::strings::contains_re(view, "a.*f"); + results = cudf::strings::contains_re(view, *prog_dotall); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_contains); + results = cudf::strings::contains_re(view, pattern); expected_contains = cudf::test::fixed_width_column_wrapper({0, 0, 1, 0}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_contains); + results = cudf::strings::contains_re(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_contains); - results = cudf::strings::matches_re(view, "a.*f", cudf::strings::regex_flags::DOTALL); + results = cudf::strings::matches_re(view, pattern, cudf::strings::regex_flags::DOTALL); auto expected_matches = cudf::test::fixed_width_column_wrapper({1, 0, 1, 0}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_matches); - results = cudf::strings::matches_re(view, "a.*f"); + results = cudf::strings::matches_re(view, *prog_dotall); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_matches); + results = cudf::strings::matches_re(view, pattern); expected_matches = cudf::test::fixed_width_column_wrapper({0, 0, 1, 0}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_matches); + results = cudf::strings::matches_re(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_matches); + + pattern = std::string("a.*?f"); + prog = cudf::strings::regex_program::create(pattern); + prog_dotall = cudf::strings::regex_program::create(pattern, cudf::strings::regex_flags::DOTALL); - results = cudf::strings::count_re(view, "a.*?f", cudf::strings::regex_flags::DOTALL); + results = cudf::strings::count_re(view, pattern, cudf::strings::regex_flags::DOTALL); auto expected_count = cudf::test::fixed_width_column_wrapper({2, 1, 1, 0}); 
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_count); - results = cudf::strings::count_re(view, "a.*?f"); + results = cudf::strings::count_re(view, *prog_dotall); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_count); + results = cudf::strings::count_re(view, pattern); expected_count = cudf::test::fixed_width_column_wrapper({0, 0, 1, 0}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_count); + results = cudf::strings::count_re(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_count); - auto both_flags = cudf::strings::regex_flags::DOTALL | cudf::strings::regex_flags::MULTILINE; - results = - cudf::strings::count_re(view, "a.*?f", static_cast(both_flags)); - expected_count = cudf::test::fixed_width_column_wrapper({2, 1, 1, 0}); + auto both_flags = static_cast(cudf::strings::regex_flags::DOTALL | + cudf::strings::regex_flags::MULTILINE); + results = cudf::strings::count_re(view, pattern, both_flags); + expected_count = cudf::test::fixed_width_column_wrapper({2, 1, 1, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_count); + auto prog_both = cudf::strings::regex_program::create(pattern, both_flags); + results = cudf::strings::count_re(view, *prog_both); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_count); } @@ -586,9 +756,16 @@ TEST_F(StringsContainsTests, ASCII) auto results = cudf::strings::contains_re(view, ptn, cudf::strings::regex_flags::ASCII); auto expected_contains = cudf::test::fixed_width_column_wrapper({1, 0, 0, 0}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_contains); + auto prog = cudf::strings::regex_program::create(ptn, cudf::strings::regex_flags::ASCII); + results = cudf::strings::contains_re(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_contains); + results = cudf::strings::contains_re(view, ptn); expected_contains = cudf::test::fixed_width_column_wrapper({1, 1, 1, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_contains); + prog 
= cudf::strings::regex_program::create(ptn); + results = cudf::strings::contains_re(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_contains); } } diff --git a/cpp/tests/strings/extract_tests.cpp b/cpp/tests/strings/extract_tests.cpp index e396ca42d6c..62d7ef2a418 100644 --- a/cpp/tests/strings/extract_tests.cpp +++ b/cpp/tests/strings/extract_tests.cpp @@ -21,6 +21,7 @@ #include #include +#include #include #include @@ -76,6 +77,10 @@ TEST_F(StringsExtractTests, ExtractTest) columns.push_back(expected2.release()); cudf::table expected(std::move(columns)); CUDF_TEST_EXPECT_TABLES_EQUAL(*results, expected); + + auto prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::extract(strings_view, pattern); + CUDF_TEST_EXPECT_TABLES_EQUAL(*results, expected); } TEST_F(StringsExtractTests, ExtractDomainTest) @@ -117,6 +122,10 @@ TEST_F(StringsExtractTests, ExtractDomainTest) }); cudf::table_view expected{{expected1}}; CUDF_TEST_EXPECT_TABLES_EQUAL(*results, expected); + + auto prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::extract(strings_view, *prog); + CUDF_TEST_EXPECT_TABLES_EQUAL(*results, expected); } TEST_F(StringsExtractTests, ExtractEventTest) @@ -144,9 +153,13 @@ TEST_F(StringsExtractTests, ExtractEventTest) "Test Message Description"}); for (std::size_t idx = 0; idx < patterns.size(); ++idx) { - auto results = cudf::strings::extract(strings_view, patterns[idx]); + auto pattern = patterns[idx]; + auto results = cudf::strings::extract(strings_view, pattern); cudf::test::strings_column_wrapper expected({expecteds[idx]}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view().column(0), expected); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view().column(0), expected); + auto prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::extract(strings_view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view().column(0), expected); } } @@ -156,15 +169,24 @@ 
TEST_F(StringsExtractTests, MultiLine) cudf::test::strings_column_wrapper({"abc\nfff\nabc", "fff\nabc\nlll", "abc", "", "abc\n"}); auto view = cudf::strings_column_view(input); - auto results = cudf::strings::extract(view, "(^[a-c]+$)", cudf::strings::regex_flags::MULTILINE); + auto pattern = std::string("(^[a-c]+$)"); + auto results = cudf::strings::extract(view, pattern, cudf::strings::regex_flags::MULTILINE); cudf::test::strings_column_wrapper expected_multiline({"abc", "abc", "abc", "", "abc"}, {1, 1, 1, 0, 1}); auto expected = cudf::table_view{{expected_multiline}}; CUDF_TEST_EXPECT_TABLES_EQUAL(*results, expected); - results = cudf::strings::extract(view, "^([a-c]+)$"); + auto prog = cudf::strings::regex_program::create(pattern, cudf::strings::regex_flags::MULTILINE); + results = cudf::strings::extract(view, *prog); + CUDF_TEST_EXPECT_TABLES_EQUAL(*results, expected); + + pattern = std::string("^([a-c]+)$"); + results = cudf::strings::extract(view, pattern); cudf::test::strings_column_wrapper expected_default({"", "", "abc", "", ""}, {0, 0, 1, 0, 0}); expected = cudf::table_view{{expected_default}}; CUDF_TEST_EXPECT_TABLES_EQUAL(*results, expected); + prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::extract(view, *prog); + CUDF_TEST_EXPECT_TABLES_EQUAL(*results, expected); } TEST_F(StringsExtractTests, DotAll) @@ -172,15 +194,23 @@ TEST_F(StringsExtractTests, DotAll) auto input = cudf::test::strings_column_wrapper({"abc\nfa\nef", "fff\nabbc\nfff", "abcdef", ""}); auto view = cudf::strings_column_view(input); - auto results = cudf::strings::extract(view, "(a.*f)", cudf::strings::regex_flags::DOTALL); + auto pattern = std::string("(a.*f)"); + auto results = cudf::strings::extract(view, pattern, cudf::strings::regex_flags::DOTALL); cudf::test::strings_column_wrapper expected_dotall({"abc\nfa\nef", "abbc\nfff", "abcdef", ""}, {1, 1, 1, 0}); auto expected = cudf::table_view{{expected_dotall}}; CUDF_TEST_EXPECT_TABLES_EQUAL(*results, 
expected); - results = cudf::strings::extract(view, "(a.*f)"); + auto prog = cudf::strings::regex_program::create(pattern, cudf::strings::regex_flags::DOTALL); + results = cudf::strings::extract(view, *prog); + CUDF_TEST_EXPECT_TABLES_EQUAL(*results, expected); + + results = cudf::strings::extract(view, pattern); cudf::test::strings_column_wrapper expected_default({"", "", "abcdef", ""}, {0, 0, 1, 0}); expected = cudf::table_view{{expected_default}}; CUDF_TEST_EXPECT_TABLES_EQUAL(*results, expected); + prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::extract(view, *prog); + CUDF_TEST_EXPECT_TABLES_EQUAL(*results, expected); } TEST_F(StringsExtractTests, EmptyExtractTest) @@ -192,7 +222,8 @@ TEST_F(StringsExtractTests, EmptyExtractTest) thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); auto strings_view = cudf::strings_column_view(strings); - auto results = cudf::strings::extract(strings_view, "([^_]*)\\Z"); + auto pattern = std::string("([^_]*)\\Z"); + auto results = cudf::strings::extract(strings_view, pattern); std::vector h_expected{nullptr, "AAA", "A", "", "", ""}; cudf::test::strings_column_wrapper expected( @@ -203,6 +234,9 @@ TEST_F(StringsExtractTests, EmptyExtractTest) columns.push_back(expected.release()); cudf::table table_expected(std::move(columns)); CUDF_TEST_EXPECT_TABLES_EQUAL(*results, table_expected); + auto prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::extract(strings_view, *prog); + CUDF_TEST_EXPECT_TABLES_EQUAL(*results, table_expected); } TEST_F(StringsExtractTests, ExtractAllTest) @@ -214,7 +248,8 @@ TEST_F(StringsExtractTests, ExtractAllTest) cudf::test::strings_column_wrapper input(h_input.begin(), h_input.end(), validity); auto sv = cudf::strings_column_view(input); - auto results = cudf::strings::extract_all_record(sv, "(\\d+) (\\w+)"); + auto pattern = std::string("(\\d+) (\\w+)"); + auto results = 
cudf::strings::extract_all_record(sv, pattern); bool valids[] = {true, true, true, false, false, false, true}; using LCW = cudf::test::lists_column_wrapper; @@ -226,15 +261,24 @@ TEST_F(StringsExtractTests, ExtractAllTest) LCW{}, LCW{"4", "pare"}}, valids); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); + auto prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::extract_all_record(sv, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); } TEST_F(StringsExtractTests, Errors) { cudf::test::strings_column_wrapper input({"this column intentionally left blank"}); auto sv = cudf::strings_column_view(input); - EXPECT_THROW(cudf::strings::extract(sv, "\\w+"), cudf::logic_error); - EXPECT_THROW(cudf::strings::extract_all_record(sv, "\\w+"), cudf::logic_error); + + auto pattern = std::string("\\w+"); + auto prog = cudf::strings::regex_program::create(pattern); + + EXPECT_THROW(cudf::strings::extract(sv, pattern), cudf::logic_error); + EXPECT_THROW(cudf::strings::extract(sv, *prog), cudf::logic_error); + EXPECT_THROW(cudf::strings::extract_all_record(sv, pattern), cudf::logic_error); + EXPECT_THROW(cudf::strings::extract_all_record(sv, *prog), cudf::logic_error); } TEST_F(StringsExtractTests, MediumRegex) diff --git a/cpp/tests/strings/findall_tests.cpp b/cpp/tests/strings/findall_tests.cpp index 1dd088cb70f..6428be28e0a 100644 --- a/cpp/tests/strings/findall_tests.cpp +++ b/cpp/tests/strings/findall_tests.cpp @@ -20,6 +20,7 @@ #include #include +#include #include #include @@ -35,8 +36,10 @@ TEST_F(StringsFindallTests, FindallTest) cudf::test::strings_column_wrapper input( {"3-A", "4-May 5-Day 6-Hay", "12-Dec-2021-Jan", "Feb-March", "4 ABC", "", "", "25-9000-Hal"}, valids); + auto sv = cudf::strings_column_view(input); - auto results = cudf::strings::findall(cudf::strings_column_view(input), "(\\d+)-(\\w+)"); + auto pattern = 
std::string("(\\d+)-(\\w+)"); + auto results = cudf::strings::findall(sv, pattern); using LCW = cudf::test::lists_column_wrapper; LCW expected({LCW{"3-A"}, @@ -49,6 +52,9 @@ TEST_F(StringsFindallTests, FindallTest) LCW{"25-9000"}}, valids); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); + auto prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::findall(sv, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); } TEST_F(StringsFindallTests, Multiline) @@ -56,10 +62,14 @@ TEST_F(StringsFindallTests, Multiline) cudf::test::strings_column_wrapper input({"abc\nfff\nabc", "fff\nabc\nlll", "abc", "", "abc\n"}); auto view = cudf::strings_column_view(input); - auto results = cudf::strings::findall(view, "(^abc$)", cudf::strings::regex_flags::MULTILINE); + auto pattern = std::string("(^abc$)"); + auto results = cudf::strings::findall(view, pattern, cudf::strings::regex_flags::MULTILINE); using LCW = cudf::test::lists_column_wrapper; LCW expected({LCW{"abc", "abc"}, LCW{"abc"}, LCW{"abc"}, LCW{}, LCW{"abc"}}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); + auto prog = cudf::strings::regex_program::create(pattern, cudf::strings::regex_flags::MULTILINE); + results = cudf::strings::findall(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); } TEST_F(StringsFindallTests, DotAll) @@ -67,10 +77,14 @@ TEST_F(StringsFindallTests, DotAll) cudf::test::strings_column_wrapper input({"abc\nfa\nef", "fff\nabbc\nfff", "abcdef", ""}); auto view = cudf::strings_column_view(input); - auto results = cudf::strings::findall(view, "(b.*f)", cudf::strings::regex_flags::DOTALL); + auto pattern = std::string("(b.*f)"); + auto results = cudf::strings::findall(view, pattern, cudf::strings::regex_flags::DOTALL); using LCW = cudf::test::lists_column_wrapper; LCW expected({LCW{"bc\nfa\nef"}, LCW{"bbc\nfff"}, LCW{"bcdef"}, LCW{}}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); + 
auto prog = cudf::strings::regex_program::create(pattern, cudf::strings::regex_flags::DOTALL); + results = cudf::strings::findall(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); } TEST_F(StringsFindallTests, MediumRegex) diff --git a/cpp/tests/strings/replace_regex_tests.cpp b/cpp/tests/strings/replace_regex_tests.cpp index 6280463d112..840d998e56c 100644 --- a/cpp/tests/strings/replace_regex_tests.cpp +++ b/cpp/tests/strings/replace_regex_tests.cpp @@ -17,7 +17,9 @@ #include #include #include +#include +#include #include #include @@ -39,9 +41,7 @@ TEST_F(StringsReplaceRegexTest, ReplaceRegexTest) nullptr}; cudf::test::strings_column_wrapper strings( - h_strings.begin(), - h_strings.end(), - thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); + h_strings.begin(), h_strings.end(), cudf::test::iterators::nulls_from_nullptrs(h_strings)); auto strings_view = cudf::strings_column_view(strings); std::vector h_expected{"= quick brown fox jumps over = lazy dog", @@ -52,13 +52,15 @@ TEST_F(StringsReplaceRegexTest, ReplaceRegexTest) "", nullptr}; - std::string pattern = "(\\bthe\\b)"; - auto results = cudf::strings::replace_re(strings_view, pattern, cudf::string_scalar("=")); + auto pattern = std::string("(\\bthe\\b)"); + auto repl = cudf::string_scalar("="); + auto results = cudf::strings::replace_re(strings_view, pattern, repl); cudf::test::strings_column_wrapper expected( - h_expected.begin(), - h_expected.end(), - thrust::make_transform_iterator(h_expected.begin(), [](auto str) { return str != nullptr; })); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + h_expected.begin(), h_expected.end(), cudf::test::iterators::nulls_from_nullptrs(h_expected)); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); + auto prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::replace_re(strings_view, *prog, repl); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } 
TEST_F(StringsReplaceRegexTest, ReplaceMultiRegexTest) @@ -72,9 +74,7 @@ TEST_F(StringsReplaceRegexTest, ReplaceMultiRegexTest) nullptr}; cudf::test::strings_column_wrapper strings( - h_strings.begin(), - h_strings.end(), - thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); + h_strings.begin(), h_strings.end(), cudf::test::iterators::nulls_from_nullptrs(h_strings)); auto strings_view = cudf::strings_column_view(strings); std::vector h_expected{" quick brown fox jumps over lazy dog", @@ -91,101 +91,132 @@ TEST_F(StringsReplaceRegexTest, ReplaceMultiRegexTest) auto repls_view = cudf::strings_column_view(repls); auto results = cudf::strings::replace_re(strings_view, patterns, repls_view); cudf::test::strings_column_wrapper expected( - h_expected.begin(), - h_expected.end(), - thrust::make_transform_iterator(h_expected.begin(), [](auto str) { return str != nullptr; })); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + h_expected.begin(), h_expected.end(), cudf::test::iterators::nulls_from_nullptrs(h_expected)); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } TEST_F(StringsReplaceRegexTest, InvalidRegex) { - cudf::test::strings_column_wrapper strings( - {"abc*def|ghi+jkl", ""}); // these do not really matter - auto strings_view = cudf::strings_column_view(strings); - // these are quantifiers that do not have a preceding character/class - EXPECT_THROW(cudf::strings::replace_re(strings_view, "*", cudf::string_scalar("")), - cudf::logic_error); - EXPECT_THROW(cudf::strings::replace_re(strings_view, "|", cudf::string_scalar("")), - cudf::logic_error); - EXPECT_THROW(cudf::strings::replace_re(strings_view, "+", cudf::string_scalar("")), - cudf::logic_error); - EXPECT_THROW(cudf::strings::replace_re(strings_view, "ab(*)", cudf::string_scalar("")), - cudf::logic_error); - EXPECT_THROW(cudf::strings::replace_re(strings_view, "\\", cudf::string_scalar("")), - cudf::logic_error); - 
EXPECT_THROW(cudf::strings::replace_re(strings_view, "\\p", cudf::string_scalar("")), - cudf::logic_error); + EXPECT_THROW(cudf::strings::regex_program::create("*"), cudf::logic_error); + EXPECT_THROW(cudf::strings::regex_program::create("|"), cudf::logic_error); + EXPECT_THROW(cudf::strings::regex_program::create("+"), cudf::logic_error); + EXPECT_THROW(cudf::strings::regex_program::create("ab(*)"), cudf::logic_error); + EXPECT_THROW(cudf::strings::regex_program::create("\\"), cudf::logic_error); + EXPECT_THROW(cudf::strings::regex_program::create("\\p"), cudf::logic_error); } TEST_F(StringsReplaceRegexTest, WithEmptyPattern) { std::vector h_strings{"asd", "xcv"}; cudf::test::strings_column_wrapper strings( - h_strings.begin(), - h_strings.end(), - thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); + h_strings.begin(), h_strings.end(), cudf::test::iterators::nulls_from_nullptrs(h_strings)); auto strings_view = cudf::strings_column_view(strings); - std::vector patterns({""}); + + auto empty_pattern = std::string(""); + auto repl = cudf::string_scalar("bbb"); + std::vector patterns({empty_pattern}); cudf::test::strings_column_wrapper repls({"bbb"}); auto repls_view = cudf::strings_column_view(repls); auto results = cudf::strings::replace_re(strings_view, patterns, repls_view); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, strings); - results = cudf::strings::replace_re(strings_view, "", cudf::string_scalar("bbb")); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, strings); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, strings); + results = cudf::strings::replace_re(strings_view, "", repl); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, strings); + auto prog = cudf::strings::regex_program::create(empty_pattern); + results = cudf::strings::replace_re(strings_view, *prog, repl); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, strings); } TEST_F(StringsReplaceRegexTest, MultiReplacement) { cudf::test::strings_column_wrapper 
input({"aba bcd aba", "abababa abababa"}); - auto results = - cudf::strings::replace_re(cudf::strings_column_view(input), "aba", cudf::string_scalar("_"), 2); + auto sv = cudf::strings_column_view(input); + + auto pattern = std::string("aba"); + auto repl = cudf::string_scalar("_"); + auto results = cudf::strings::replace_re(sv, pattern, repl, 2); cudf::test::strings_column_wrapper expected({"_ bcd _", "_b_ abababa"}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); - results = - cudf::strings::replace_re(cudf::strings_column_view(input), "aba", cudf::string_scalar(""), 0); + auto prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::replace_re(sv, *prog, repl, 2); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); + + results = cudf::strings::replace_re(sv, pattern, repl, 0); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, input); + results = cudf::strings::replace_re(sv, *prog, repl, 0); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, input); } TEST_F(StringsReplaceRegexTest, WordBoundary) { cudf::test::strings_column_wrapper input({"aba bcd\naba", "zéz", "A1B2-é3", "e é", "_", "a_b"}); - auto results = - cudf::strings::replace_re(cudf::strings_column_view(input), "\\b", cudf::string_scalar("X")); + auto sv = cudf::strings_column_view(input); + + auto pattern = std::string("\\b"); + auto repl = cudf::string_scalar("X"); + auto results = cudf::strings::replace_re(sv, pattern, repl); auto expected = cudf::test::strings_column_wrapper( {"XabaX XbcdX\nXabaX", "XzézX", "XA1B2X-Xé3X", "XeX XéX", "X_X", "Xa_bX"}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); - results = - cudf::strings::replace_re(cudf::strings_column_view(input), "\\B", cudf::string_scalar("X")); + auto prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::replace_re(sv, *prog, repl); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); + + pattern = std::string("\\B"); + results = cudf::strings::replace_re(sv, 
pattern, repl); expected = cudf::test::strings_column_wrapper( {"aXbXa bXcXd\naXbXa", "zXéXz", "AX1XBX2-éX3", "e é", "_", "aX_Xb"}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); + prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::replace_re(sv, *prog, repl); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } TEST_F(StringsReplaceRegexTest, Alternation) { cudf::test::strings_column_wrapper input( {"16 6 brr 232323 1 hello 90", "123 ABC 00 2022", "abé123 4567 89xyz"}); - auto results = cudf::strings::replace_re( - cudf::strings_column_view(input), "(^|\\s)\\d+(\\s|$)", cudf::string_scalar("_")); + auto sv = cudf::strings_column_view(input); + + auto pattern = std::string("(^|\\s)\\d+(\\s|$)"); + auto repl = cudf::string_scalar("_"); + auto results = cudf::strings::replace_re(sv, pattern, repl); auto expected = cudf::test::strings_column_wrapper({"__ brr __ hello _", "_ABC_2022", "abé123 _ 89xyz"}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); - results = cudf::strings::replace_re( - cudf::strings_column_view(input), "(\\s|^)\\d+($|\\s)", cudf::string_scalar("_")); + auto prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::replace_re(sv, *prog, repl); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); + + pattern = std::string("(\\s|^)\\d+($|\\s)"); + results = cudf::strings::replace_re(sv, pattern, repl); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); + prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::replace_re(sv, *prog, repl); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } TEST_F(StringsReplaceRegexTest, ZeroLengthMatch) { cudf::test::strings_column_wrapper input({"DD", "zéz", "DsDs", ""}); + auto sv = cudf::strings_column_view(input); + + auto pattern = std::string("D*"); auto repl = cudf::string_scalar("_"); - auto results = cudf::strings::replace_re(cudf::strings_column_view(input), "D*", repl); + auto 
results = cudf::strings::replace_re(sv, pattern, repl); auto expected = cudf::test::strings_column_wrapper({"__", "_z_é_z_", "__s__s_", "_"}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); - results = cudf::strings::replace_re(cudf::strings_column_view(input), "D?s?", repl); + auto prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::replace_re(sv, *prog, repl); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); + + pattern = std::string("D?s?"); + results = cudf::strings::replace_re(sv, pattern, repl); expected = cudf::test::strings_column_wrapper({"___", "_z_é_z_", "___", "_"}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); + prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::replace_re(sv, *prog, repl); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } TEST_F(StringsReplaceRegexTest, Multiline) @@ -196,14 +227,21 @@ TEST_F(StringsReplaceRegexTest, Multiline) auto sv = cudf::strings_column_view(input); // single-replace - auto results = - cudf::strings::replace_re(sv, "^aba$", cudf::string_scalar("_"), std::nullopt, multiline); + auto pattern = std::string("^aba$"); + auto repl = cudf::string_scalar("_"); + auto results = cudf::strings::replace_re(sv, pattern, repl, std::nullopt, multiline); cudf::test::strings_column_wrapper expected_ml({"bcd\n_\nefg", "_\naba abab\n_", "_"}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_ml); + auto prog = cudf::strings::regex_program::create(pattern, multiline); + results = cudf::strings::replace_re(sv, *prog, repl); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_ml); - results = cudf::strings::replace_re(sv, "^aba$", cudf::string_scalar("_")); + results = cudf::strings::replace_re(sv, pattern, repl); cudf::test::strings_column_wrapper expected({"bcd\naba\nefg", "aba\naba abab\naba", "_"}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); + prog = cudf::strings::regex_program::create(pattern); + results 
= cudf::strings::replace_re(sv, *prog, repl); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); // multi-replace std::vector patterns({"aba$", "^aba"}); @@ -217,15 +255,23 @@ TEST_F(StringsReplaceRegexTest, Multiline) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, multi_expected); // backref-replace - results = cudf::strings::replace_with_backrefs(sv, "(^aba)", "[\\1]", multiline); + auto repl_template = std::string("[\\1]"); + pattern = std::string("(^aba)"); + results = cudf::strings::replace_with_backrefs(sv, pattern, repl_template, multiline); cudf::test::strings_column_wrapper br_expected_ml( {"bcd\n[aba]\nefg", "[aba]\n[aba] abab\n[aba]", "[aba]"}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, br_expected_ml); + prog = cudf::strings::regex_program::create(pattern, multiline); + results = cudf::strings::replace_with_backrefs(sv, *prog, repl_template); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, br_expected_ml); - results = cudf::strings::replace_with_backrefs(sv, "(^aba)", "[\\1]"); + results = cudf::strings::replace_with_backrefs(sv, pattern, repl_template); cudf::test::strings_column_wrapper br_expected( {"bcd\naba\nefg", "[aba]\naba abab\naba", "[aba]"}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, br_expected); + prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::replace_with_backrefs(sv, *prog, repl_template); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, br_expected); } TEST_F(StringsReplaceRegexTest, ReplaceBackrefsRegexTest) @@ -239,10 +285,8 @@ TEST_F(StringsReplaceRegexTest, ReplaceBackrefsRegexTest) nullptr}; cudf::test::strings_column_wrapper strings( - h_strings.begin(), - h_strings.end(), - thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); - auto strings_view = cudf::strings_column_view(strings); + h_strings.begin(), h_strings.end(), cudf::test::iterators::nulls_from_nullptrs(h_strings)); + auto sv = cudf::strings_column_view(strings); std::vector 
h_expected{"the-quick-brown-fox-jumps-over-the-lazy-dog", "the-fat-cat-lays-next-to-the-other-accénted-cat", @@ -252,38 +296,43 @@ TEST_F(StringsReplaceRegexTest, ReplaceBackrefsRegexTest) "", nullptr}; - std::string pattern = "(\\w) (\\w)"; - std::string repl_template = "\\1-\\2"; - auto results = cudf::strings::replace_with_backrefs(strings_view, pattern, repl_template); + auto pattern = std::string("(\\w) (\\w)"); + auto repl_template = std::string("\\1-\\2"); + auto results = cudf::strings::replace_with_backrefs(sv, pattern, repl_template); cudf::test::strings_column_wrapper expected( - h_expected.begin(), - h_expected.end(), - thrust::make_transform_iterator(h_expected.begin(), [](auto str) { return str != nullptr; })); + h_expected.begin(), h_expected.end(), cudf::test::iterators::nulls_from_nullptrs(h_expected)); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + auto prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::replace_with_backrefs(sv, *prog, repl_template); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } TEST_F(StringsReplaceRegexTest, ReplaceBackrefsRegexAltIndexPatternTest) { - cudf::test::strings_column_wrapper strings({"12-3 34-5 67-89", "0-99: 777-888:: 5673-0"}); - auto strings_view = cudf::strings_column_view(strings); + cudf::test::strings_column_wrapper input({"12-3 34-5 67-89", "0-99: 777-888:: 5673-0"}); + auto sv = cudf::strings_column_view(input); - std::string pattern = "(\\d+)-(\\d+)"; - std::string repl_template = "${2} X ${1}0"; - auto results = cudf::strings::replace_with_backrefs(strings_view, pattern, repl_template); + auto pattern = std::string("(\\d+)-(\\d+)"); + auto repl_template = std::string("${2} X ${1}0"); + auto results = cudf::strings::replace_with_backrefs(sv, pattern, repl_template); cudf::test::strings_column_wrapper expected( {"3 X 120 5 X 340 89 X 670", "99 X 00: 888 X 7770:: 0 X 56730"}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + auto prog = 
cudf::strings::regex_program::create(pattern); + results = cudf::strings::replace_with_backrefs(sv, *prog, repl_template); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } TEST_F(StringsReplaceRegexTest, ReplaceBackrefsRegexReversedTest) { cudf::test::strings_column_wrapper strings( {"A543", "Z756", "", "tést-string", "two-thréé four-fivé", "abcd-éfgh", "tést-string-again"}); - auto strings_view = cudf::strings_column_view(strings); - std::string pattern = "([a-z])-([a-zé])"; - std::string repl_template = "X\\2+\\1Z"; - auto results = cudf::strings::replace_with_backrefs(strings_view, pattern, repl_template); + auto sv = cudf::strings_column_view(strings); + + auto pattern = std::string("([a-z])-([a-zé])"); + auto repl_template = std::string("X\\2+\\1Z"); + auto results = cudf::strings::replace_with_backrefs(sv, pattern, repl_template); cudf::test::strings_column_wrapper expected({"A543", "Z756", @@ -293,33 +342,45 @@ TEST_F(StringsReplaceRegexTest, ReplaceBackrefsRegexReversedTest) "abcXé+dZfgh", "tésXs+tZtrinXa+gZgain"}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + auto prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::replace_with_backrefs(sv, *prog, repl_template); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } TEST_F(StringsReplaceRegexTest, BackrefWithGreedyQuantifier) { cudf::test::strings_column_wrapper input( {"

title

ABC

", "

1234567

XYZ

"}); - std::string replacement = "

\\1

\\2

"; + auto sv = cudf::strings_column_view(input); + + auto pattern = std::string("

(.*)

(.*)

"); + auto repl_template = std::string("

\\1

\\2

"); - auto results = cudf::strings::replace_with_backrefs( - cudf::strings_column_view(input), "

(.*)

(.*)

", replacement); + auto results = cudf::strings::replace_with_backrefs(sv, pattern, repl_template); cudf::test::strings_column_wrapper expected( {"

title

ABC

", "

1234567

XYZ

"}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); + auto prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::replace_with_backrefs(sv, *prog, repl_template); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); - results = cudf::strings::replace_with_backrefs( - cudf::strings_column_view(input), "

([a-z\\d]+)

([A-Z]+)

", replacement); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + pattern = std::string("

([a-z\\d]+)

([A-Z]+)

"); + results = cudf::strings::replace_with_backrefs(sv, pattern, repl_template); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); + prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::replace_with_backrefs(sv, *prog, repl_template); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } TEST_F(StringsReplaceRegexTest, ReplaceBackrefsRegexZeroIndexTest) { cudf::test::strings_column_wrapper strings( {"TEST123", "TEST1TEST2", "TEST2-TEST1122", "TEST1-TEST-T", "TES3"}); - auto strings_view = cudf::strings_column_view(strings); - std::string pattern = "(TEST)(\\d+)"; - std::string repl_template = "${0}: ${1}, ${2}; "; - auto results = cudf::strings::replace_with_backrefs(strings_view, pattern, repl_template); + auto sv = cudf::strings_column_view(strings); + + auto pattern = std::string("(TEST)(\\d+)"); + auto repl_template = std::string("${0}: ${1}, ${2}; "); + auto results = cudf::strings::replace_with_backrefs(sv, pattern, repl_template); cudf::test::strings_column_wrapper expected({ "TEST123: TEST, 123; ", @@ -329,6 +390,9 @@ TEST_F(StringsReplaceRegexTest, ReplaceBackrefsRegexZeroIndexTest) "TES3", }); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + auto prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::replace_with_backrefs(sv, *prog, repl_template); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } TEST_F(StringsReplaceRegexTest, ReplaceBackrefsRegexErrorTest) From bd9cd9cf60a162724d11bf84ef54eb099d72755c Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 20 Oct 2022 16:21:55 -0400 Subject: [PATCH 06/11] add gtests for split_re functions --- cpp/tests/strings/split_tests.cpp | 110 ++++++++++++++++++++++-------- 1 file changed, 83 insertions(+), 27 deletions(-) diff --git a/cpp/tests/strings/split_tests.cpp b/cpp/tests/strings/split_tests.cpp index c7bbce263f3..73d5adab427 100644 --- a/cpp/tests/strings/split_tests.cpp +++ b/cpp/tests/strings/split_tests.cpp @@ -14,20 
+14,21 @@ * limitations under the License. */ +#include +#include +#include +#include +#include + #include #include +#include #include #include #include #include #include -#include -#include -#include -#include -#include - #include #include @@ -316,21 +317,28 @@ TEST_F(StringsSplitTest, SplitRegex) auto sv = cudf::strings_column_view(input); { - auto result = cudf::strings::split_re(sv, "\\s+"); + auto pattern = std::string("\\s+"); + auto result = cudf::strings::split_re(sv, pattern); cudf::test::strings_column_wrapper col0({"", "", "are", "tést", ""}, validity); cudf::test::strings_column_wrapper col1({"Héllo", "", "some", "String", ""}, {1, 0, 1, 1, 0}); cudf::test::strings_column_wrapper col2({"thesé", "", "", "", ""}, {1, 0, 1, 0, 0}); auto expected = cudf::table_view({col0, col1, col2}); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected); + auto prog = cudf::strings::regex_program::create(pattern); + result = cudf::strings::split_re(sv, *prog); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected); // rsplit == split when using default parameters - result = cudf::strings::rsplit_re(sv, "\\s+"); + result = cudf::strings::rsplit_re(sv, pattern); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected); + result = cudf::strings::rsplit_re(sv, *prog); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected); } { - auto result = cudf::strings::split_re(sv, "[eé]"); + auto pattern = std::string("[eé]"); + auto result = cudf::strings::split_re(sv, pattern); cudf::test::strings_column_wrapper col0({" H", "", "ar", "t", ""}, validity); cudf::test::strings_column_wrapper col1({"llo th", "", " som", "st String", ""}, @@ -339,9 +347,14 @@ TEST_F(StringsSplitTest, SplitRegex) cudf::test::strings_column_wrapper col3({"", "", "", "", ""}, {1, 0, 0, 0, 0}); auto expected = cudf::table_view({col0, col1, col2, col3}); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected); + auto prog = cudf::strings::regex_program::create(pattern); + result 
= cudf::strings::split_re(sv, *prog); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected); // rsplit == split when using default parameters - result = cudf::strings::rsplit_re(sv, "[eé]"); + result = cudf::strings::rsplit_re(sv, pattern); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected); + result = cudf::strings::rsplit_re(sv, *prog); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected); } } @@ -356,20 +369,27 @@ TEST_F(StringsSplitTest, SplitRecordRegex) using LCW = cudf::test::lists_column_wrapper; { - auto result = cudf::strings::split_record_re(sv, "\\s+"); + auto pattern = std::string("\\s+"); + auto result = cudf::strings::split_record_re(sv, pattern); LCW expected( {LCW{"", "Héllo", "thesé"}, LCW{}, LCW{"are", "some", ""}, LCW{"tést", "String"}, LCW{""}}, validity); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); + auto prog = cudf::strings::regex_program::create(pattern); + result = cudf::strings::split_record_re(sv, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); // rsplit == split when using default parameters - result = cudf::strings::rsplit_record_re(sv, "\\s+"); + result = cudf::strings::rsplit_record_re(sv, pattern); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); + result = cudf::strings::rsplit_record_re(sv, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); } { - auto result = cudf::strings::split_record_re(sv, "[eé]"); + auto pattern = std::string("[eé]"); + auto result = cudf::strings::split_record_re(sv, pattern); LCW expected({LCW{" H", "llo th", "s", ""}, LCW{}, @@ -378,9 +398,14 @@ TEST_F(StringsSplitTest, SplitRecordRegex) LCW{""}}, validity); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); + auto prog = cudf::strings::regex_program::create(pattern); + result = cudf::strings::split_record_re(sv, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); // rsplit == split when using default parameters - 
result = cudf::strings::rsplit_record_re(sv, "[eé]"); + result = cudf::strings::rsplit_record_re(sv, pattern); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); + result = cudf::strings::rsplit_record_re(sv, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); } } @@ -393,37 +418,51 @@ TEST_F(StringsSplitTest, SplitRegexWithMaxSplit) cudf::test::strings_column_wrapper input(h_strings.begin(), h_strings.end(), validity); auto sv = cudf::strings_column_view(input); { - auto result = cudf::strings::split_re(sv, "\\s+", 1); + auto pattern = std::string("\\s+"); + auto result = cudf::strings::split_re(sv, pattern, 1); cudf::test::strings_column_wrapper col0({"", "", "are", "tést", ""}, {1, 0, 1, 1, 1}); cudf::test::strings_column_wrapper col1({"Héllo\tthesé", "", "some ", "String", ""}, {1, 0, 1, 1, 0}); auto expected = cudf::table_view({col0, col1}); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected); + auto prog = cudf::strings::regex_program::create(pattern); + result = cudf::strings::split_re(sv, *prog, 1); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected); // split everything is the same output as maxsplit==2 for the test input column here - result = cudf::strings::split_re(sv, "\\s+", 2); - auto expected2 = cudf::strings::split_re(sv, "\\s+"); + result = cudf::strings::split_re(sv, pattern, 2); + auto expected2 = cudf::strings::split_re(sv, pattern); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected2->view()); + result = cudf::strings::split_re(sv, *prog, 3); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected2->view()); } { - auto result = cudf::strings::split_record_re(sv, "\\s", 1); + auto pattern = std::string("\\s"); + auto result = cudf::strings::split_record_re(sv, pattern, 1); using LCW = cudf::test::lists_column_wrapper; LCW expected1( {LCW{"", "Héllo\tthesé"}, LCW{}, LCW{"are", "some "}, LCW{"tést", "String"}, LCW{""}}, validity); 
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected1); + auto prog = cudf::strings::regex_program::create(pattern); + result = cudf::strings::split_record_re(sv, *prog, 1); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected1); - result = cudf::strings::split_record_re(sv, "\\s", 2); + result = cudf::strings::split_record_re(sv, pattern, 2); LCW expected2( {LCW{"", "Héllo", "thesé"}, LCW{}, LCW{"are", "some", " "}, LCW{"tést", "String"}, LCW{""}}, validity); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected2); + result = cudf::strings::split_record_re(sv, *prog, 2); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected2); // split everything is the same output as maxsplit==3 for the test input column here - result = cudf::strings::split_record_re(sv, "\\s", 3); - auto expected0 = cudf::strings::split_record_re(sv, "\\s"); + result = cudf::strings::split_record_re(sv, pattern, 3); + auto expected0 = cudf::strings::split_record_re(sv, pattern); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected0->view()); + result = cudf::strings::split_record_re(sv, *prog, 3); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected0->view()); } } @@ -433,7 +472,8 @@ TEST_F(StringsSplitTest, SplitRegexWordBoundary) cudf::test::strings_column_wrapper input({"a", "ab", "-+", "e\né"}); auto sv = cudf::strings_column_view(input); { - auto result = cudf::strings::split_re(sv, "\\b"); + auto pattern = std::string("\\b"); + auto result = cudf::strings::split_re(sv, pattern); cudf::test::strings_column_wrapper col0({"", "", "-+", ""}); cudf::test::strings_column_wrapper col1({"a", "ab", "", "e"}, {1, 1, 0, 1}); @@ -442,13 +482,20 @@ TEST_F(StringsSplitTest, SplitRegexWordBoundary) cudf::test::strings_column_wrapper col4({"", "", "", ""}, {0, 0, 0, 1}); auto expected = cudf::table_view({col0, col1, col2, col3, col4}); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected); + auto prog = 
cudf::strings::regex_program::create(pattern); + result = cudf::strings::split_re(sv, *prog); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected); } { - auto result = cudf::strings::split_record_re(sv, "\\B"); + auto pattern = std::string("\\B"); + auto result = cudf::strings::split_record_re(sv, pattern); using LCW = cudf::test::lists_column_wrapper; LCW expected({LCW{"a"}, LCW{"a", "b"}, LCW{"", "-", "+", ""}, LCW{"e\né"}}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); + auto prog = cudf::strings::regex_program::create(pattern); + result = cudf::strings::split_record_re(sv, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); } } @@ -551,26 +598,35 @@ TEST_F(StringsSplitTest, RSplitRegexWithMaxSplit) cudf::test::strings_column_wrapper input(h_strings.begin(), h_strings.end(), validity); auto sv = cudf::strings_column_view(input); + auto pattern = std::string("\\s+"); + auto prog = cudf::strings::regex_program::create(pattern); + { - auto result = cudf::strings::rsplit_re(sv, "\\s+", 1); + auto result = cudf::strings::rsplit_re(sv, pattern, 1); cudf::test::strings_column_wrapper col0({" Héllo", "", "are some", "tést", ""}, validity); cudf::test::strings_column_wrapper col1({"thesé", "", "", "String", ""}, {1, 0, 1, 1, 0}); auto expected = cudf::table_view({col0, col1}); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected); + result = cudf::strings::rsplit_re(sv, *prog, 1); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected); } { - auto result = cudf::strings::rsplit_record_re(sv, "\\s+", 1); + auto result = cudf::strings::rsplit_record_re(sv, pattern, 1); using LCW = cudf::test::lists_column_wrapper; LCW expected( {LCW{" Héllo", "thesé"}, LCW{}, LCW{"are some", ""}, LCW{"tést", "String"}, LCW{""}}, validity); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); + result = cudf::strings::rsplit_record_re(sv, *prog, 1); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); // 
split everything is the same output as any maxsplit > 2 for the test input column here - result = cudf::strings::rsplit_record_re(sv, "\\s+", 3); - auto expected0 = cudf::strings::rsplit_record_re(sv, "\\s+"); + result = cudf::strings::rsplit_record_re(sv, pattern, 3); + auto expected0 = cudf::strings::rsplit_record_re(sv, pattern); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected0->view()); + result = cudf::strings::rsplit_record_re(sv, *prog, 3); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected0->view()); } } From 8c904f619df21f17bf88e19c1894ecb367769e0a Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 25 Oct 2022 15:00:33 -0400 Subject: [PATCH 07/11] update doxygen for future work --- cpp/src/strings/regex/regex_program_impl.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cpp/src/strings/regex/regex_program_impl.h b/cpp/src/strings/regex/regex_program_impl.h index 5c20d6e982f..70c5900cc03 100644 --- a/cpp/src/strings/regex/regex_program_impl.h +++ b/cpp/src/strings/regex/regex_program_impl.h @@ -32,6 +32,9 @@ namespace strings { struct regex_program::regex_program_impl { detail::reprog prog; + // TODO: There will be other options added here in the future to handle issues + // 10852 and possibly others like 11979 + /** * @brief Return device instance of reprog object * From 915df870b611d0b83b5e757a13cf2736ae22d1b8 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 2 Nov 2022 21:12:19 -0400 Subject: [PATCH 08/11] delete def ctor; fix parameter order --- cpp/include/cudf/strings/regex/regex_program.hpp | 2 +- cpp/src/strings/regex/regex_program.cpp | 3 +-- cpp/src/strings/regex/regex_program_impl.h | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/cpp/include/cudf/strings/regex/regex_program.hpp b/cpp/include/cudf/strings/regex/regex_program.hpp index da3ec87ffc4..38fdf39ec01 100644 --- a/cpp/include/cudf/strings/regex/regex_program.hpp +++ b/cpp/include/cudf/strings/regex/regex_program.hpp @@ -122,7 
+122,7 @@ struct regex_program { ~regex_program(); private: - regex_program(); + regex_program() = delete; std::string _pattern; regex_flags _flags; diff --git a/cpp/src/strings/regex/regex_program.cpp b/cpp/src/strings/regex/regex_program.cpp index 4a382caaa5a..108c1d07d85 100644 --- a/cpp/src/strings/regex/regex_program.cpp +++ b/cpp/src/strings/regex/regex_program.cpp @@ -32,7 +32,6 @@ std::unique_ptr regex_program::create(std::string_view pattern, return std::unique_ptr(p); } -regex_program::regex_program() = default; regex_program::~regex_program() = default; regex_program::regex_program(regex_program&& other) = default; regex_program& regex_program::operator=(regex_program&& other) = default; @@ -56,7 +55,7 @@ int32_t regex_program::groups_count() const { return _impl->prog.groups_count(); std::size_t regex_program::compute_working_memory_size(int32_t num_strings) const { - return detail::compute_working_memory_size(instructions_count(), num_strings); + return detail::compute_working_memory_size(num_strings, instructions_count()); } regex_program::regex_program_impl* regex_program::get_impl() const { return _impl.get(); } diff --git a/cpp/src/strings/regex/regex_program_impl.h b/cpp/src/strings/regex/regex_program_impl.h index 70c5900cc03..b76ade4e43c 100644 --- a/cpp/src/strings/regex/regex_program_impl.h +++ b/cpp/src/strings/regex/regex_program_impl.h @@ -40,7 +40,7 @@ struct regex_program::regex_program_impl { * * @param stream CUDA stream to use for device memory allocations and copies */ - auto create_prog_device(rmm::cuda_stream_view stream) + auto create_prog_device(rmm::cuda_stream_view stream) const { return detail::reprog_device::create(prog, stream); } From c4c686b26a5907ee9fc8a40a8041c8bb1e10da30 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 3 Nov 2022 08:40:54 -0400 Subject: [PATCH 09/11] replace get_impl() with friend access class --- cpp/include/cudf/strings/regex/regex_program.hpp | 14 +++++++------- cpp/src/strings/contains.cu | 4 
++-- cpp/src/strings/extract/extract.cu | 2 +- cpp/src/strings/extract/extract_all.cu | 2 +- cpp/src/strings/regex/regex_program.cpp | 2 -- cpp/src/strings/regex/regex_program_impl.h | 11 ++++------- cpp/src/strings/replace/backref_re.cu | 2 +- cpp/src/strings/replace/multi_re.cu | 2 +- cpp/src/strings/replace/replace_re.cu | 2 +- cpp/src/strings/search/findall.cu | 2 +- cpp/src/strings/split/split_re.cu | 4 ++-- 11 files changed, 21 insertions(+), 26 deletions(-) diff --git a/cpp/include/cudf/strings/regex/regex_program.hpp b/cpp/include/cudf/strings/regex/regex_program.hpp index 38fdf39ec01..9cd891956e1 100644 --- a/cpp/include/cudf/strings/regex/regex_program.hpp +++ b/cpp/include/cudf/strings/regex/regex_program.hpp @@ -104,13 +104,6 @@ struct regex_program { */ int32_t groups_count() const; - /** - * @brief Return implementation object - * - * @return impl object instance - */ - regex_program_impl* get_impl() const; - /** * @brief Return the pattern used to create this instance * @@ -130,7 +123,14 @@ struct regex_program { std::unique_ptr _impl; + /** + * @brief Constructor + * + * Called by create() + */ regex_program(std::string_view pattern, regex_flags flags, capture_groups capture); + + friend struct regex_device_builder; }; /** @} */ // end of doxygen group diff --git a/cpp/src/strings/contains.cu b/cpp/src/strings/contains.cu index 88a829aac81..17abbe9927b 100644 --- a/cpp/src/strings/contains.cu +++ b/cpp/src/strings/contains.cu @@ -71,7 +71,7 @@ std::unique_ptr contains_impl(strings_column_view const& input, mr); if (input.is_empty()) { return results; } - auto d_prog = prog.get_impl()->create_prog_device(stream); + auto d_prog = regex_device_builder::create_prog_device(prog, stream); auto d_results = results->mutable_view().data(); auto const d_strings = column_device_view::create(input.parent(), stream); @@ -111,7 +111,7 @@ std::unique_ptr count_re( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { // create device 
object from regex_program - auto d_prog = prog.get_impl()->create_prog_device(stream); + auto d_prog = regex_device_builder::create_prog_device(prog, stream); auto const d_strings = column_device_view::create(input.parent(), stream); diff --git a/cpp/src/strings/extract/extract.cu b/cpp/src/strings/extract/extract.cu index 2d5ff4df2ea..f99b0e63715 100644 --- a/cpp/src/strings/extract/extract.cu +++ b/cpp/src/strings/extract/extract.cu @@ -92,7 +92,7 @@ std::unique_ptr
extract(strings_column_view const& input, rmm::mr::device_memory_resource* mr) { // create device object from regex_program - auto d_prog = prog.get_impl()->create_prog_device(stream); + auto d_prog = regex_device_builder::create_prog_device(prog, stream); auto const groups = d_prog->group_counts(); CUDF_EXPECTS(groups > 0, "Group indicators not found in regex pattern"); diff --git a/cpp/src/strings/extract/extract_all.cu b/cpp/src/strings/extract/extract_all.cu index 29dfa974c57..4e957f43624 100644 --- a/cpp/src/strings/extract/extract_all.cu +++ b/cpp/src/strings/extract/extract_all.cu @@ -106,7 +106,7 @@ std::unique_ptr extract_all_record( auto const d_strings = column_device_view::create(input.parent(), stream); // create device object from regex_program - auto d_prog = prog.get_impl()->create_prog_device(stream); + auto d_prog = regex_device_builder::create_prog_device(prog, stream); // The extract pattern should always include groups. auto const groups = d_prog->group_counts(); diff --git a/cpp/src/strings/regex/regex_program.cpp b/cpp/src/strings/regex/regex_program.cpp index 108c1d07d85..39326a54b60 100644 --- a/cpp/src/strings/regex/regex_program.cpp +++ b/cpp/src/strings/regex/regex_program.cpp @@ -58,7 +58,5 @@ std::size_t regex_program::compute_working_memory_size(int32_t num_strings) cons return detail::compute_working_memory_size(num_strings, instructions_count()); } -regex_program::regex_program_impl* regex_program::get_impl() const { return _impl.get(); } - } // namespace strings } // namespace cudf diff --git a/cpp/src/strings/regex/regex_program_impl.h b/cpp/src/strings/regex/regex_program_impl.h index b76ade4e43c..0ae3b0d3847 100644 --- a/cpp/src/strings/regex/regex_program_impl.h +++ b/cpp/src/strings/regex/regex_program_impl.h @@ -34,15 +34,12 @@ struct regex_program::regex_program_impl { // TODO: There will be other options added here in the future to handle issues // 10852 and possibly others like 11979 +}; - /** - * @brief Return device 
instance of reprog object - * - * @param stream CUDA stream to use for device memory allocations and copies - */ - auto create_prog_device(rmm::cuda_stream_view stream) const +struct regex_device_builder { + static auto create_prog_device(regex_program const& p, rmm::cuda_stream_view stream) { - return detail::reprog_device::create(prog, stream); + return detail::reprog_device::create(p._impl->prog, stream); } }; diff --git a/cpp/src/strings/replace/backref_re.cu b/cpp/src/strings/replace/backref_re.cu index 87d012aa595..383337c9088 100644 --- a/cpp/src/strings/replace/backref_re.cu +++ b/cpp/src/strings/replace/backref_re.cu @@ -114,7 +114,7 @@ std::unique_ptr replace_with_backrefs(strings_column_view const& input, CUDF_EXPECTS(!replacement.empty(), "Parameter replacement must not be empty"); // create device object from regex_program - auto d_prog = prog.get_impl()->create_prog_device(stream); + auto d_prog = regex_device_builder::create_prog_device(prog, stream); // parse the repl string for back-ref indicators auto group_count = std::min(99, d_prog->group_counts()); // group count should NOT exceed 99 diff --git a/cpp/src/strings/replace/multi_re.cu b/cpp/src/strings/replace/multi_re.cu index 1955eaaf6f0..9971780df65 100644 --- a/cpp/src/strings/replace/multi_re.cu +++ b/cpp/src/strings/replace/multi_re.cu @@ -147,7 +147,7 @@ std::unique_ptr replace_re( std::transform( patterns.begin(), patterns.end(), h_progs.begin(), [flags, stream](auto const& ptn) { auto h_prog = regex_program::create(ptn, flags, capture_groups::NON_CAPTURE); - return h_prog->get_impl()->create_prog_device(stream); + return regex_device_builder::create_prog_device(*h_prog, stream); }); // get the longest regex for the dispatcher diff --git a/cpp/src/strings/replace/replace_re.cu b/cpp/src/strings/replace/replace_re.cu index 1228295b329..0e2f3169e8e 100644 --- a/cpp/src/strings/replace/replace_re.cu +++ b/cpp/src/strings/replace/replace_re.cu @@ -114,7 +114,7 @@ std::unique_ptr 
replace_re(strings_column_view const& input, string_view d_repl(replacement.data(), replacement.size()); // create device object from regex_program - auto d_prog = prog.get_impl()->create_prog_device(stream); + auto d_prog = regex_device_builder::create_prog_device(prog, stream); auto const maxrepl = max_replace_count.value_or(-1); diff --git a/cpp/src/strings/search/findall.cu b/cpp/src/strings/search/findall.cu index bda44743cf4..a623172ac97 100644 --- a/cpp/src/strings/search/findall.cu +++ b/cpp/src/strings/search/findall.cu @@ -103,7 +103,7 @@ std::unique_ptr findall( auto const d_strings = column_device_view::create(input.parent(), stream); // create device object from regex_program - auto d_prog = prog.get_impl()->create_prog_device(stream); + auto d_prog = regex_device_builder::create_prog_device(prog, stream); // Create lists offsets column auto offsets = count_matches(*d_strings, *d_prog, strings_count + 1, stream, mr); diff --git a/cpp/src/strings/split/split_re.cu b/cpp/src/strings/split/split_re.cu index 18661efebf0..8600f0aa465 100644 --- a/cpp/src/strings/split/split_re.cu +++ b/cpp/src/strings/split/split_re.cu @@ -202,7 +202,7 @@ std::unique_ptr
split_re(strings_column_view const& input, } // create device object from regex_program - auto d_prog = prog.get_impl()->create_prog_device(stream); + auto d_prog = regex_device_builder::create_prog_device(prog, stream); auto d_strings = column_device_view::create(input.parent(), stream); @@ -265,7 +265,7 @@ std::unique_ptr split_record_re(strings_column_view const& input, auto const strings_count = input.size(); // create device object from regex_program - auto d_prog = prog.get_impl()->create_prog_device(stream); + auto d_prog = regex_device_builder::create_prog_device(prog, stream); auto d_strings = column_device_view::create(input.parent(), stream); From f85897f0b22eee3a23fe5c945d1647731c2cfb52 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 3 Nov 2022 14:57:49 -0400 Subject: [PATCH 10/11] add ctors for the impl class --- cpp/src/strings/regex/regex_program.cpp | 7 ++++--- cpp/src/strings/regex/regex_program_impl.h | 3 +++ 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/cpp/src/strings/regex/regex_program.cpp b/cpp/src/strings/regex/regex_program.cpp index 39326a54b60..c64da213fcf 100644 --- a/cpp/src/strings/regex/regex_program.cpp +++ b/cpp/src/strings/regex/regex_program.cpp @@ -37,10 +37,11 @@ regex_program::regex_program(regex_program&& other) = default; regex_program& regex_program::operator=(regex_program&& other) = default; regex_program::regex_program(std::string_view pattern, regex_flags flags, capture_groups capture) - : _pattern(pattern), _flags(flags) + : _pattern(pattern), + _flags(flags), + _impl( + std::make_unique(detail::reprog::create_from(pattern, flags, capture))) { - auto p = new regex_program_impl{detail::reprog::create_from(pattern, flags, capture)}; - _impl = std::unique_ptr(p); } std::string regex_program::pattern() const { return _pattern; } diff --git a/cpp/src/strings/regex/regex_program_impl.h b/cpp/src/strings/regex/regex_program_impl.h index 0ae3b0d3847..eede2225bce 100644 --- 
a/cpp/src/strings/regex/regex_program_impl.h +++ b/cpp/src/strings/regex/regex_program_impl.h @@ -32,6 +32,9 @@ namespace strings { struct regex_program::regex_program_impl { detail::reprog prog; + regex_program_impl(detail::reprog const& p) : prog(p) {} + regex_program_impl(detail::reprog&& p) : prog(p) {} + // TODO: There will be other options added here in the future to handle issues // 10852 and possibly others like 11979 }; From d2d997a11451ed55a0b9541e24442c5963915af6 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 7 Nov 2022 10:02:51 -0500 Subject: [PATCH 11/11] fix wording in doxygen comment for regex_program --- cpp/include/cudf/strings/regex/regex_program.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/include/cudf/strings/regex/regex_program.hpp b/cpp/include/cudf/strings/regex/regex_program.hpp index 9cd891956e1..2b606393719 100644 --- a/cpp/include/cudf/strings/regex/regex_program.hpp +++ b/cpp/include/cudf/strings/regex/regex_program.hpp @@ -35,7 +35,7 @@ namespace strings { * Create an instance from a regex pattern and use it to call the appropriate * strings APIs. An instance can be reused. * - * See the @ref md_regex "Regex Features" page for details on patterns and APIs the support regex. + * See the @ref md_regex "Regex Features" page for details on patterns and APIs that support regex. */ struct regex_program { struct regex_program_impl; @@ -43,7 +43,7 @@ struct regex_program { /** * @brief Create a program from a pattern * - * @throw cudf::logic_error If pattern is found to be invalid or contain unsupported features + * @throw cudf::logic_error If pattern is invalid or contains unsupported features * * @param pattern Regex pattern * @param flags Regex flags for interpreting special characters in the pattern