diff --git a/cpp/benchmarks/string/contains.cpp b/cpp/benchmarks/string/contains.cpp index f7f394ea048..a04915d1df8 100644 --- a/cpp/benchmarks/string/contains.cpp +++ b/cpp/benchmarks/string/contains.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -83,18 +84,19 @@ static void BM_contains(benchmark::State& state, contains_type ct) auto input = cudf::strings_column_view(col->view()); auto pattern = patterns[pattern_index]; + auto program = cudf::strings::regex_program::create(pattern); for (auto _ : state) { cuda_event_timer raii(state, true, cudf::get_default_stream()); switch (ct) { case contains_type::contains: // contains_re and matches_re use the same main logic - cudf::strings::contains_re(input, pattern); + cudf::strings::contains_re(input, *program); break; case contains_type::count: // counts occurrences of matches - cudf::strings::count_re(input, pattern); + cudf::strings::count_re(input, *program); break; case contains_type::findall: // returns occurrences of all matches - cudf::strings::findall(input, pattern); + cudf::strings::findall(input, *program); break; } } diff --git a/cpp/benchmarks/string/extract.cpp b/cpp/benchmarks/string/extract.cpp index 4e9ac2f5395..4760956e049 100644 --- a/cpp/benchmarks/string/extract.cpp +++ b/cpp/benchmarks/string/extract.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -23,6 +23,7 @@ #include #include +#include #include #include @@ -59,10 +60,11 @@ static void BM_extract(benchmark::State& state, int groups) auto input = cudf::gather( cudf::table_view{{samples_column}}, map->view(), cudf::out_of_bounds_policy::DONT_CHECK); cudf::strings_column_view strings_view(input->get_column(0).view()); + auto prog = cudf::strings::regex_program::create(pattern); for (auto _ : state) { cuda_event_timer raii(state, true); - auto results = cudf::strings::extract(strings_view, pattern); + auto results = cudf::strings::extract(strings_view, *prog); } state.SetBytesProcessed(state.iterations() * strings_view.chars_size()); diff --git a/cpp/benchmarks/string/replace_re.cpp b/cpp/benchmarks/string/replace_re.cpp index 7e9d6036750..b5dcf316a0e 100644 --- a/cpp/benchmarks/string/replace_re.cpp +++ b/cpp/benchmarks/string/replace_re.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -22,6 +22,7 @@ #include +#include #include #include #include @@ -40,18 +41,20 @@ static void BM_replace(benchmark::State& state, replace_type rt) auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows}, profile); cudf::strings_column_view input(column->view()); cudf::test::strings_column_wrapper repls({"#", ""}); + auto prog = cudf::strings::regex_program::create("\\d+"); + auto prog_backref = cudf::strings::regex_program::create("(\\d+)"); for (auto _ : state) { cuda_event_timer raii(state, true, cudf::get_default_stream()); switch (rt) { case replace_type::replace_re: // contains_re and matches_re use the same main logic - cudf::strings::replace_re(input, "\\d+"); + cudf::strings::replace_re(input, *prog); break; case replace_type::replace_re_multi: // counts occurrences of pattern cudf::strings::replace_re(input, {"\\d+", "\\s+"}, cudf::strings_column_view(repls)); break; case replace_type::replace_backref: // returns occurrences of matches - cudf::strings::replace_with_backrefs(input, "(\\d+)", "#\\1X"); + cudf::strings::replace_with_backrefs(input, *prog_backref, "#\\1X"); break; } } diff --git a/cpp/include/cudf/strings/contains.hpp b/cpp/include/cudf/strings/contains.hpp index aee349415e3..aebc4ae7dab 100644 --- a/cpp/include/cudf/strings/contains.hpp +++ b/cpp/include/cudf/strings/contains.hpp @@ -34,33 +34,6 @@ struct regex_program; * @brief Strings APIs for regex contains, count, matches */ -/** - * @brief Returns a boolean column identifying rows which - * match the given regex pattern. - * - * @code{.pseudo} - * Example: - * s = ["abc","123","def456"] - * r = contains_re(s,"\\d+") - * r is now [false, true, true] - * @endcode - * - * Any null string entries return corresponding null output column entries. - * - * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. - * - * @param strings Strings instance for this operation. - * @param pattern Regex pattern to match to each string. 
- * @param flags Regex flags for interpreting special characters in the pattern. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New column of boolean results for each string. - */ -std::unique_ptr contains_re( - strings_column_view const& strings, - std::string_view pattern, - regex_flags const flags = regex_flags::DEFAULT, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - /** * @brief Returns a boolean column identifying rows which * match the given regex_program object @@ -89,26 +62,29 @@ std::unique_ptr contains_re( /** * @brief Returns a boolean column identifying rows which - * matching the given regex pattern but only at the beginning the string. + * match the given regex pattern. * * @code{.pseudo} * Example: * s = ["abc","123","def456"] - * r = matches_re(s,"\\d+") - * r is now [false, true, false] + * r = contains_re(s,"\\d+") + * r is now [false, true, true] * @endcode * * Any null string entries return corresponding null output column entries. * * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. * + * @deprecated Use @link contains_re contains_re(strings_column_view const&, + * regex_program const&, rmm::mr::device_memory_resource*) @endlink + * * @param strings Strings instance for this operation. * @param pattern Regex pattern to match to each string. * @param flags Regex flags for interpreting special characters in the pattern. * @param mr Device memory resource used to allocate the returned column's device memory. * @return New column of boolean results for each string. 
*/ -std::unique_ptr matches_re( +[[deprecated]] std::unique_ptr contains_re( strings_column_view const& strings, std::string_view pattern, regex_flags const flags = regex_flags::DEFAULT, @@ -141,27 +117,30 @@ std::unique_ptr matches_re( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Returns the number of times the given regex pattern - * matches in each string. + * @brief Returns a boolean column identifying rows which + * match the given regex pattern but only at the beginning of the string. * * @code{.pseudo} * Example: - * s = ["abc","123","def45"] - * r = count_re(s,"\\d") - * r is now [0, 3, 2] + * s = ["abc","123","def456"] + * r = matches_re(s,"\\d+") + * r is now [false, true, false] * @endcode * * Any null string entries return corresponding null output column entries. * * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. * + * @deprecated Use @link matches_re matches_re(strings_column_view const&, + * regex_program const&, rmm::mr::device_memory_resource*) @endlink + * * @param strings Strings instance for this operation. - * @param pattern Regex pattern to match within each string. + * @param pattern Regex pattern to match to each string. * @param flags Regex flags for interpreting special characters in the pattern. * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New INT32 column with counts for each string. + * @return New column of boolean results for each string. */ -std::unique_ptr count_re( +[[deprecated]] std::unique_ptr matches_re( strings_column_view const& strings, std::string_view pattern, regex_flags const flags = regex_flags::DEFAULT, @@ -193,6 +172,36 @@ std::unique_ptr count_re( regex_program const& prog, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Returns the number of times the given regex pattern + * matches in each string. 
+ * + * @code{.pseudo} + * Example: + * s = ["abc","123","def45"] + * r = count_re(s,"\\d") + * r is now [0, 3, 2] + * @endcode + * + * Any null string entries return corresponding null output column entries. + * + * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. + * + * @deprecated Use @link count_re count_re(strings_column_view const&, + * regex_program const&, rmm::mr::device_memory_resource*) @endlink + * + * @param strings Strings instance for this operation. + * @param pattern Regex pattern to match within each string. + * @param flags Regex flags for interpreting special characters in the pattern. + * @param mr Device memory resource used to allocate the returned column's device memory. + * @return New INT32 column with counts for each string. + */ +[[deprecated]] std::unique_ptr count_re( + strings_column_view const& strings, + std::string_view pattern, + regex_flags const flags = regex_flags::DEFAULT, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Returns a boolean column identifying rows which * match the given like pattern. diff --git a/cpp/include/cudf/strings/extract.hpp b/cpp/include/cudf/strings/extract.hpp index a80d971438d..e1a940259ac 100644 --- a/cpp/include/cudf/strings/extract.hpp +++ b/cpp/include/cudf/strings/extract.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -27,14 +27,14 @@ namespace strings { struct regex_program; /** - * @addtogroup strings_substring + * @addtogroup strings_extract * @{ * @file */ /** * @brief Returns a table of strings columns where each column corresponds to the matching - * group specified in the given regular expression pattern. 
+ * group specified in the given regex_program object * * All the strings for the first group will go in the first output column; the second group * go in the second column and so on. Null entries are added to the columns in row `i` if @@ -45,28 +45,27 @@ struct regex_program; * @code{.pseudo} * Example: * s = ["a1", "b2", "c3"] - * r = extract(s, "([ab])(\\d)") + * p = regex_program::create("([ab])(\\d)") + * r = extract(s, p) * r is now [ ["a", "b", null], * ["1", "2", null] ] * @endcode * * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. * - * @param strings Strings instance for this operation. - * @param pattern The regular expression pattern with group indicators. - * @param flags Regex flags for interpreting special characters in the pattern. - * @param mr Device memory resource used to allocate the returned table's device memory. - * @return Columns of strings extracted from the input column. + * @param strings Strings instance for this operation + * @param prog Regex program instance + * @param mr Device memory resource used to allocate the returned table's device memory + * @return Columns of strings extracted from the input column */ std::unique_ptr extract( strings_column_view const& strings, - std::string_view pattern, - regex_flags const flags = regex_flags::DEFAULT, + regex_program const& prog, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Returns a table of strings columns where each column corresponds to the matching - * group specified in the given regex_program object + * group specified in the given regular expression pattern. * * All the strings for the first group will go in the first output column; the second group * go in the second column and so on. Null entries are added to the columns in row `i` if @@ -77,27 +76,31 @@ std::unique_ptr
extract( * @code{.pseudo} * Example: * s = ["a1", "b2", "c3"] - * p = regex_program::create("([ab])(\\d)") - * r = extract(s, p) + * r = extract(s, "([ab])(\\d)") * r is now [ ["a", "b", null], * ["1", "2", null] ] * @endcode * * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. * - * @param strings Strings instance for this operation - * @param prog Regex program instance - * @param mr Device memory resource used to allocate the returned table's device memory - * @return Columns of strings extracted from the input column + * @deprecated Use @link extract extract(strings_column_view const&, + * regex_program const&, rmm::mr::device_memory_resource*) @endlink + * + * @param strings Strings instance for this operation. + * @param pattern The regular expression pattern with group indicators. + * @param flags Regex flags for interpreting special characters in the pattern. + * @param mr Device memory resource used to allocate the returned table's device memory. + * @return Columns of strings extracted from the input column. */ -std::unique_ptr
extract( +[[deprecated]] std::unique_ptr
extract( strings_column_view const& strings, - regex_program const& prog, + std::string_view pattern, + regex_flags const flags = regex_flags::DEFAULT, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Returns a lists column of strings where each string column row corresponds to the - * matching group specified in the given regular expression pattern. + * matching group specified in the given regex_program object * * All the matching groups for the first row will go in the first row output column; the second * row results will go into the second row output column and so on. @@ -108,7 +111,8 @@ std::unique_ptr
extract( * @code{.pseudo} * Example: * s = ["a1 b4", "b2", "c3 a5", "b", null] - * r = extract_all_record(s,"([ab])(\\d)") + * p = regex_program::create("([ab])(\\d)") + * r = extract_all_record(s, p) * r is now [ ["a", "1", "b", "4"], * ["b", "2"], * ["a", "5"], @@ -118,21 +122,19 @@ std::unique_ptr
extract( * * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. * - * @param strings Strings instance for this operation. - * @param pattern The regular expression pattern with group indicators. - * @param flags Regex flags for interpreting special characters in the pattern. - * @param mr Device memory resource used to allocate any returned device memory. - * @return Lists column containing strings extracted from the input column. + * @param strings Strings instance for this operation + * @param prog Regex program instance + * @param mr Device memory resource used to allocate any returned device memory + * @return Lists column containing strings extracted from the input column */ std::unique_ptr extract_all_record( strings_column_view const& strings, - std::string_view pattern, - regex_flags const flags = regex_flags::DEFAULT, + regex_program const& prog, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Returns a lists column of strings where each string column row corresponds to the - * matching group specified in the given regex_program object + * matching group specified in the given regular expression pattern. * * All the matching groups for the first row will go in the first row output column; the second * row results will go into the second row output column and so on. @@ -143,8 +145,7 @@ std::unique_ptr extract_all_record( * @code{.pseudo} * Example: * s = ["a1 b4", "b2", "c3 a5", "b", null] - * p = regex_program::create("([ab])(\\d)") - * r = extract_all_record(s, p) + * r = extract_all_record(s,"([ab])(\\d)") * r is now [ ["a", "1", "b", "4"], * ["b", "2"], * ["a", "5"], @@ -154,14 +155,19 @@ std::unique_ptr extract_all_record( * * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. 
* - * @param strings Strings instance for this operation - * @param prog Regex program instance - * @param mr Device memory resource used to allocate any returned device memory - * @return Lists column containing strings extracted from the input column + * @deprecated Use @link extract_all_record extract_all_record(strings_column_view const&, + * regex_program const&, rmm::mr::device_memory_resource*) @endlink + * + * @param strings Strings instance for this operation. + * @param pattern The regular expression pattern with group indicators. + * @param flags Regex flags for interpreting special characters in the pattern. + * @param mr Device memory resource used to allocate any returned device memory. + * @return Lists column containing strings extracted from the input column. */ -std::unique_ptr extract_all_record( +[[deprecated]] std::unique_ptr extract_all_record( strings_column_view const& strings, - regex_program const& prog, + std::string_view pattern, + regex_flags const flags = regex_flags::DEFAULT, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/findall.hpp b/cpp/include/cudf/strings/findall.hpp index 366e1eb0482..3ac881777e4 100644 --- a/cpp/include/cudf/strings/findall.hpp +++ b/cpp/include/cudf/strings/findall.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -33,8 +33,8 @@ struct regex_program; */ /** - * @brief Returns a lists column of strings for each matching occurrence of the - * regex pattern within each string. 
+ * @brief Returns a lists column of strings for each matching occurrence using + * the regex_program pattern within each string * * Each output row includes all the substrings within the corresponding input row * that match the given pattern. If no matches are found, the output row is empty. @@ -42,7 +42,8 @@ struct regex_program; * @code{.pseudo} * Example: * s = ["bunny", "rabbit", "hare", "dog"] - * r = findall(s, "[ab]") + * p = regex_program::create("[ab]") + * r = findall(s, p) * r is now a lists column like: * [ ["b"] * ["a","b","b"] @@ -54,21 +55,19 @@ struct regex_program; * * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. * - * @param input Strings instance for this operation. - * @param pattern Regex pattern to match within each string. - * @param flags Regex flags for interpreting special characters in the pattern. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New lists column of strings. + * @param input Strings instance for this operation + * @param prog Regex program instance + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New lists column of strings */ std::unique_ptr findall( strings_column_view const& input, - std::string_view pattern, - regex_flags const flags = regex_flags::DEFAULT, + regex_program const& prog, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Returns a lists column of strings for each matching occurrence using - * the regex_program pattern within each string + * @brief Returns a lists column of strings for each matching occurrence of the + * regex pattern within each string. * * Each output row includes all the substrings within the corresponding input row * that match the given pattern. If no matches are found, the output row is empty. 
@@ -76,8 +75,7 @@ std::unique_ptr findall( * @code{.pseudo} * Example: * s = ["bunny", "rabbit", "hare", "dog"] - * p = regex_program::create("[ab]") - * r = findall(s, p) + * r = findall(s, "[ab]") * r is now a lists column like: * [ ["b"] * ["a","b","b"] @@ -89,14 +87,19 @@ std::unique_ptr findall( * * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. * - * @param input Strings instance for this operation - * @param prog Regex program instance - * @param mr Device memory resource used to allocate the returned column's device memory - * @return New lists column of strings + * @deprecated Use @link findall findall(strings_column_view const&, + * regex_program const&, rmm::mr::device_memory_resource*) @endlink + * + * @param input Strings instance for this operation. + * @param pattern Regex pattern to match within each string. + * @param flags Regex flags for interpreting special characters in the pattern. + * @param mr Device memory resource used to allocate the returned column's device memory. + * @return New lists column of strings. */ -std::unique_ptr findall( +[[deprecated]] std::unique_ptr findall( strings_column_view const& input, - regex_program const& prog, + std::string_view pattern, + regex_flags const flags = regex_flags::DEFAULT, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/replace_re.hpp b/cpp/include/cudf/strings/replace_re.hpp index 60c66956fb8..70e44a68c9a 100644 --- a/cpp/include/cudf/strings/replace_re.hpp +++ b/cpp/include/cudf/strings/replace_re.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -36,53 +36,56 @@ struct regex_program; */ /** - * @brief For each string, replaces any character sequence matching the given pattern + * @brief For each string, replaces any character sequence matching the given regex * with the provided replacement string. * * Any null string entries return corresponding null output column entries. * * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. * - * @param strings Strings instance for this operation. - * @param pattern The regular expression pattern to search within each string. + * @param strings Strings instance for this operation + * @param prog Regex program instance * @param replacement The string used to replace the matched sequence in each string. * Default is an empty string. * @param max_replace_count The maximum number of times to replace the matched pattern * within each string. Default replaces every substring that is matched. - * @param flags Regex flags for interpreting special characters in the pattern. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings column. + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column */ std::unique_ptr replace_re( strings_column_view const& strings, - std::string_view pattern, + regex_program const& prog, string_scalar const& replacement = string_scalar(""), std::optional max_replace_count = std::nullopt, - regex_flags const flags = regex_flags::DEFAULT, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief For each string, replaces any character sequence matching the given regex + * @brief For each string, replaces any character sequence matching the given pattern * with the provided replacement string. * * Any null string entries return corresponding null output column entries. 
* * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. * - * @param strings Strings instance for this operation - * @param prog Regex program instance + * @deprecated Use @link replace_re replace_re(strings_column_view const&, regex_program const&, + * string_scalar const&, std::optional, rmm::mr::device_memory_resource*) @endlink + * + * @param strings Strings instance for this operation. + * @param pattern The regular expression pattern to search within each string. * @param replacement The string used to replace the matched sequence in each string. * Default is an empty string. * @param max_replace_count The maximum number of times to replace the matched pattern * within each string. Default replaces every substring that is matched. - * @param mr Device memory resource used to allocate the returned column's device memory - * @return New strings column + * @param flags Regex flags for interpreting special characters in the pattern. + * @param mr Device memory resource used to allocate the returned column's device memory. + * @return New strings column. */ -std::unique_ptr replace_re( +[[deprecated]] std::unique_ptr replace_re( strings_column_view const& strings, - regex_program const& prog, + std::string_view pattern, string_scalar const& replacement = string_scalar(""), std::optional max_replace_count = std::nullopt, + regex_flags const flags = regex_flags::DEFAULT, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -108,7 +111,7 @@ std::unique_ptr replace_re( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief For each string, replaces any character sequence matching the given pattern + * @brief For each string, replaces any character sequence matching the given regex * using the replacement template for back-references. * * Any null string entries return corresponding null output column entries. 
@@ -118,41 +121,44 @@ std::unique_ptr replace_re( * @throw cudf::logic_error if capture index values in `replacement` are not in range 0-99, and also * if the index exceeds the group count specified in the pattern * - * @param strings Strings instance for this operation. - * @param pattern The regular expression patterns to search within each string. - * @param replacement The replacement template for creating the output string. - * @param flags Regex flags for interpreting special characters in the pattern. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings column. + * @param strings Strings instance for this operation + * @param prog Regex program instance + * @param replacement The replacement template for creating the output string + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column */ std::unique_ptr replace_with_backrefs( strings_column_view const& strings, - std::string_view pattern, + regex_program const& prog, std::string_view replacement, - regex_flags const flags = regex_flags::DEFAULT, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief For each string, replaces any character sequence matching the given regex + * @brief For each string, replaces any character sequence matching the given pattern * using the replacement template for back-references. * * Any null string entries return corresponding null output column entries. * * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. 
* + * @deprecated Use @link replace_with_backrefs replace_with_backrefs(strings_column_view const&, + * regex_program const&, std::string_view, rmm::mr::device_memory_resource*) @endlink + * * @throw cudf::logic_error if capture index values in `replacement` are not in range 0-99, and also * if the index exceeds the group count specified in the pattern * - * @param strings Strings instance for this operation - * @param prog Regex program instance - * @param replacement The replacement template for creating the output string - * @param mr Device memory resource used to allocate the returned column's device memory - * @return New strings column + * @param strings Strings instance for this operation. + * @param pattern The regular expression pattern to search within each string. + * @param replacement The replacement template for creating the output string. + * @param flags Regex flags for interpreting special characters in the pattern. + * @param mr Device memory resource used to allocate the returned column's device memory. + * @return New strings column. */ -std::unique_ptr replace_with_backrefs( +[[deprecated]] std::unique_ptr replace_with_backrefs( strings_column_view const& strings, - regex_program const& prog, + std::string_view pattern, std::string_view replacement, + regex_flags const flags = regex_flags::DEFAULT, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace strings diff --git a/cpp/include/cudf/strings/split/split_re.hpp b/cpp/include/cudf/strings/split/split_re.hpp index c6bd1345ae6..fac5f130064 100644 --- a/cpp/include/cudf/strings/split/split_re.hpp +++ b/cpp/include/cudf/strings/split/split_re.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -34,7 +34,7 @@ struct regex_program; /** * @brief Splits strings elements into a table of strings columns - * using a regex pattern to delimit each string. + * using a regex_program's pattern to delimit each string * * Each element generates a vector of strings that are stored in corresponding * rows in the output table -- `table[col,row] = token[col] of strings[row]` @@ -51,15 +51,19 @@ struct regex_program; * corresponding row of the first column. * A null row will produce corresponding null rows in the output table. * + * The regex_program's regex_flags are ignored. + * * @code{.pseudo} * s = ["a_bc def_g", "a__bc", "_ab cd", "ab_cd "] - * s1 = split_re(s, "[_ ]") + * p1 = regex_program::create("[_ ]") + * s1 = split_re(s, p1) * s1 is a table of strings columns: * [ ["a", "a", "", "ab"], * ["bc", "", "ab", "cd"], * ["def", "bc", "cd", ""], * ["g", null, null, null] ] - * s2 = split_re(s, "[ _]", 1) + * p2 = regex_program::create("[ _]") + * s2 = split_re(s, p2, 1) * s2 is a table of strings columns: * [ ["a", "a", "", "ab"], * ["bc def_g", "_bc", "ab cd", "cd "] ] @@ -67,22 +71,22 @@ struct regex_program; * * @throw cudf::logic_error if `pattern` is empty. * - * @param input A column of string elements to be split. - * @param pattern The regex pattern for delimiting characters within each string. + * @param input A column of string elements to be split + * @param prog Regex program instance * @param maxsplit Maximum number of splits to perform. * Default of -1 indicates all possible splits on each string. - * @param mr Device memory resource used to allocate the returned result's device memory. - * @return A table of columns of strings. + * @param mr Device memory resource used to allocate the returned result's device memory + * @return A table of columns of strings */ std::unique_ptr
split_re( strings_column_view const& input, - std::string_view pattern, + regex_program const& prog, size_type maxsplit = -1, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Splits strings elements into a table of strings columns - * using a regex_program's pattern to delimit each string + * using a regex pattern to delimit each string. * * Each element generates a vector of strings that are stored in corresponding * rows in the output table -- `table[col,row] = token[col] of strings[row]` @@ -99,42 +103,41 @@ std::unique_ptr
split_re( * corresponding row of the first column. * A null row will produce corresponding null rows in the output table. * - * The regex_program's regex_flags are ignored. - * * @code{.pseudo} * s = ["a_bc def_g", "a__bc", "_ab cd", "ab_cd "] - * p1 = regex_program::create("[_ ]") - * s1 = split_re(s, p1) + * s1 = split_re(s, "[_ ]") * s1 is a table of strings columns: * [ ["a", "a", "", "ab"], * ["bc", "", "ab", "cd"], * ["def", "bc", "cd", ""], * ["g", null, null, null] ] - * p2 = regex_program::create("[ _]") - * s2 = split_re(s, p2, 1) + * s2 = split_re(s, "[ _]", 1) * s2 is a table of strings columns: * [ ["a", "a", "", "ab"], * ["bc def_g", "_bc", "ab cd", "cd "] ] * @endcode * + * @deprecated Use @link split_re split_re(strings_column_view const&, + * regex_program const&, size_type, rmm::mr::device_memory_resource*) @endlink + * * @throw cudf::logic_error if `pattern` is empty. * - * @param input A column of string elements to be split - * @param prog Regex program instance + * @param input A column of string elements to be split. + * @param pattern The regex pattern for delimiting characters within each string. * @param maxsplit Maximum number of splits to perform. * Default of -1 indicates all possible splits on each string. - * @param mr Device memory resource used to allocate the returned result's device memory - * @return A table of columns of strings + * @param mr Device memory resource used to allocate the returned result's device memory. + * @return A table of columns of strings. */ -std::unique_ptr
split_re( +[[deprecated]] std::unique_ptr
split_re( strings_column_view const& input, - regex_program const& prog, + std::string_view pattern, size_type maxsplit = -1, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Splits strings elements into a table of strings columns - * using a regex pattern to delimit each string starting from the end of the string. + * @brief Splits strings elements into a table of strings columns using a + * regex_program's pattern to delimit each string starting from the end of the string * * Each element generates a vector of strings that are stored in corresponding * rows in the output table -- `table[col,row] = token[col] of string[row]` @@ -153,15 +156,19 @@ std::unique_ptr
split_re( * corresponding row of the first column. * A null row will produce corresponding null rows in the output table. * + * The regex_program's regex_flags are ignored. + * * @code{.pseudo} * s = ["a_bc def_g", "a__bc", "_ab cd", "ab_cd "] - * s1 = rsplit_re(s, "[_ ]") + * p1 = regex_program::create("[_ ]") + * s1 = rsplit_re(s, p1) * s1 is a table of strings columns: * [ ["a", "a", "", "ab"], * ["bc", "", "ab", "cd"], * ["def", "bc", "cd", ""], * ["g", null, null, null] ] - * s2 = rsplit_re(s, "[ _]", 1) + * p2 = regex_program::create("[ _]") + * s2 = rsplit_re(s, p2, 1) * s2 is a table of strings columns: * [ ["a_bc def", "a_", "_ab", "ab"], * ["g", "bc", "cd", "cd "] ] @@ -170,7 +177,7 @@ std::unique_ptr
split_re( * @throw cudf::logic_error if `pattern` is empty. * * @param input A column of string elements to be split. - * @param pattern The regex pattern for delimiting characters within each string. + * @param prog Regex program instance * @param maxsplit Maximum number of splits to perform. * Default of -1 indicates all possible splits on each string. * @param mr Device memory resource used to allocate the returned result's device memory. @@ -178,13 +185,13 @@ std::unique_ptr
split_re( */ std::unique_ptr
rsplit_re( strings_column_view const& input, - std::string_view pattern, + regex_program const& prog, size_type maxsplit = -1, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Splits strings elements into a table of strings columns using a - * regex_program's pattern to delimit each string starting from the end of the string + * @brief Splits strings elements into a table of strings columns + * using a regex pattern to delimit each string starting from the end of the string. * * Each element generates a vector of strings that are stored in corresponding * rows in the output table -- `table[col,row] = token[col] of string[row]` @@ -203,42 +210,41 @@ std::unique_ptr
rsplit_re( * corresponding row of the first column. * A null row will produce corresponding null rows in the output table. * - * The regex_program's regex_flags are ignored. - * * @code{.pseudo} * s = ["a_bc def_g", "a__bc", "_ab cd", "ab_cd "] - * p1 = regex_program::create("[_ ]") - * s1 = rsplit_re(s, p1) + * s1 = rsplit_re(s, "[_ ]") * s1 is a table of strings columns: * [ ["a", "a", "", "ab"], * ["bc", "", "ab", "cd"], * ["def", "bc", "cd", ""], * ["g", null, null, null] ] - * p2 = regex_program::create("[ _]") - * s2 = rsplit_re(s, p2, 1) + * s2 = rsplit_re(s, "[ _]", 1) * s2 is a table of strings columns: * [ ["a_bc def", "a_", "_ab", "ab"], * ["g", "bc", "cd", "cd "] ] * @endcode * + * @deprecated Use @link rsplit_re rsplit_re(strings_column_view const&, + * regex_program const&, size_type, rmm::mr::device_memory_resource*) @endlink + * * @throw cudf::logic_error if `pattern` is empty. * * @param input A column of string elements to be split. - * @param prog Regex program instance + * @param pattern The regex pattern for delimiting characters within each string. * @param maxsplit Maximum number of splits to perform. * Default of -1 indicates all possible splits on each string. * @param mr Device memory resource used to allocate the returned result's device memory. * @return A table of columns of strings. */ -std::unique_ptr
rsplit_re( +[[deprecated]] std::unique_ptr
rsplit_re( strings_column_view const& input, - regex_program const& prog, + std::string_view pattern, size_type maxsplit = -1, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Splits strings elements into a list column of strings - * using the given regex pattern to delimit each string. + * using the given regex_program to delimit each string * * Each element generates an array of strings that are stored in an output * lists column -- `list[row] = [token1, token2, ...] found in input[row]` @@ -255,15 +261,19 @@ std::unique_ptr
rsplit_re( * An empty input string will produce a corresponding empty list item output row. * A null row will produce a corresponding null output row. * + * The regex_program's regex_flags are ignored. + * * @code{.pseudo} * s = ["a_bc def_g", "a__bc", "_ab cd", "ab_cd "] - * s1 = split_record_re(s, "[_ ]") + * p1 = regex_program::create("[_ ]") + * s1 = split_record_re(s, p1) * s1 is a lists column of strings: * [ ["a", "bc", "def", "g"], * ["a", "", "bc"], * ["", "ab", "cd"], * ["ab", "cd", ""] ] - * s2 = split_record_re(s, "[ _]", 1) + * p2 = regex_program::create("[ _]") + * s2 = split_record_re(s, p2, 1) * s2 is a lists column of strings: * [ ["a", "bc def_g"], * ["a", "_bc"], @@ -275,22 +285,22 @@ std::unique_ptr
rsplit_re( * * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. * - * @param input A column of string elements to be split. - * @param pattern The regex pattern for delimiting characters within each string. + * @param input A column of string elements to be split + * @param prog Regex program instance * @param maxsplit Maximum number of splits to perform. * Default of -1 indicates all possible splits on each string. - * @param mr Device memory resource used to allocate the returned result's device memory. + * @param mr Device memory resource used to allocate the returned result's device memory * @return Lists column of strings. */ std::unique_ptr split_record_re( strings_column_view const& input, - std::string_view pattern, + regex_program const& prog, size_type maxsplit = -1, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Splits strings elements into a list column of strings - * using the given regex_program to delimit each string + * using the given regex pattern to delimit each string. * * Each element generates an array of strings that are stored in an output * lists column -- `list[row] = [token1, token2, ...] found in input[row]` @@ -307,19 +317,15 @@ std::unique_ptr split_record_re( * An empty input string will produce a corresponding empty list item output row. * A null row will produce a corresponding null output row. * - * The regex_program's regex_flags are ignored. 
- * * @code{.pseudo} * s = ["a_bc def_g", "a__bc", "_ab cd", "ab_cd "] - * p1 = regex_program::create("[_ ]") - * s1 = split_record_re(s, p1) + * s1 = split_record_re(s, "[_ ]") * s1 is a lists column of strings: * [ ["a", "bc", "def", "g"], * ["a", "", "bc"], * ["", "ab", "cd"], * ["ab", "cd", ""] ] - * p2 = regex_program::create("[ _]") - * s2 = split_record_re(s, p2, 1) + * s2 = split_record_re(s, "[ _]", 1) * s2 is a lists column of strings: * [ ["a", "bc def_g"], * ["a", "_bc"], @@ -327,26 +333,29 @@ std::unique_ptr split_record_re( * ["ab", "cd "] ] * @endcode * - * @throw cudf::logic_error if `pattern` is empty. - * * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. * - * @param input A column of string elements to be split - * @param prog Regex program instance + * @deprecated Use @link split_record_re split_record_re(strings_column_view const&, + * regex_program const&, size_type, rmm::mr::device_memory_resource*) @endlink + * + * @throw cudf::logic_error if `pattern` is empty. + * + * @param input A column of string elements to be split. + * @param pattern The regex pattern for delimiting characters within each string. * @param maxsplit Maximum number of splits to perform. * Default of -1 indicates all possible splits on each string. - * @param mr Device memory resource used to allocate the returned result's device memory + * @param mr Device memory resource used to allocate the returned result's device memory. * @return Lists column of strings. */ -std::unique_ptr split_record_re( +[[deprecated]] std::unique_ptr split_record_re( strings_column_view const& input, - regex_program const& prog, + std::string_view pattern, size_type maxsplit = -1, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Splits strings elements into a list column of strings - * using the given regex pattern to delimit each string starting from the end of the string. 
+ * @brief Splits strings elements into a list column of strings using the given + * regex_program to delimit each string starting from the end of the string * * Each element generates a vector of strings that are stored in an output * lists column -- `list[row] = [token1, token2, ...] found in input[row]` @@ -365,15 +374,19 @@ std::unique_ptr split_record_re( * An empty input string will produce a corresponding empty list item output row. * A null row will produce a corresponding null output row. * + * The regex_program's regex_flags are ignored. + * * @code{.pseudo} * s = ["a_bc def_g", "a__bc", "_ab cd", "ab_cd "] - * s1 = rsplit_record_re(s, "[_ ]") + * p1 = regex_program::create("[_ ]") + * s1 = rsplit_record_re(s, p1) * s1 is a lists column of strings: * [ ["a", "bc", "def", "g"], * ["a", "", "bc"], * ["", "ab", "cd"], * ["ab", "cd", ""] ] - * s2 = rsplit_record_re(s, "[ _]", 1) + * p2 = regex_program::create("[ _]") + * s2 = rsplit_record_re(s, p2, 1) * s2 is a lists column of strings: * [ ["a_bc def", "g"], * ["a_", "bc"], @@ -385,22 +398,22 @@ std::unique_ptr split_record_re( * * @throw cudf::logic_error if `pattern` is empty. * - * @param input A column of string elements to be split. - * @param pattern The regex pattern for delimiting characters within each string. + * @param input A column of string elements to be split + * @param prog Regex program instance * @param maxsplit Maximum number of splits to perform. * Default of -1 indicates all possible splits on each string. - * @param mr Device memory resource used to allocate the returned result's device memory. - * @return Lists column of strings. 
+ * @param mr Device memory resource used to allocate the returned result's device memory + * @return Lists column of strings */ std::unique_ptr rsplit_record_re( strings_column_view const& input, - std::string_view pattern, + regex_program const& prog, size_type maxsplit = -1, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Splits strings elements into a list column of strings using the given - * regex_program to delimit each string starting from the end of the string + * @brief Splits strings elements into a list column of strings + * using the given regex pattern to delimit each string starting from the end of the string. * * Each element generates a vector of strings that are stored in an output * lists column -- `list[row] = [token1, token2, ...] found in input[row]` @@ -419,19 +432,15 @@ std::unique_ptr rsplit_record_re( * An empty input string will produce a corresponding empty list item output row. * A null row will produce a corresponding null output row. * - * The regex_program's regex_flags are ignored. - * * @code{.pseudo} * s = ["a_bc def_g", "a__bc", "_ab cd", "ab_cd "] - * p1 = regex_program::create("[_ ]") - * s1 = rsplit_record_re(s, p1) + * s1 = rsplit_record_re(s, "[_ ]") * s1 is a lists column of strings: * [ ["a", "bc", "def", "g"], * ["a", "", "bc"], * ["", "ab", "cd"], * ["ab", "cd", ""] ] - * p2 = regex_program::create("[ _]") - * s2 = rsplit_record_re(s, p2, 1) + * s2 = rsplit_record_re(s, "[ _]", 1) * s2 is a lists column of strings: * [ ["a_bc def", "g"], * ["a_", "bc"], @@ -441,18 +450,21 @@ std::unique_ptr rsplit_record_re( * * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. * + * @deprecated Use @link rsplit_record_re rsplit_record_re(strings_column_view const&, + * regex_program const&, size_type, rmm::mr::device_memory_resource*) @endlink + * * @throw cudf::logic_error if `pattern` is empty. 
* - * @param input A column of string elements to be split - * @param prog Regex program instance + * @param input A column of string elements to be split. + * @param pattern The regex pattern for delimiting characters within each string. * @param maxsplit Maximum number of splits to perform. * Default of -1 indicates all possible splits on each string. - * @param mr Device memory resource used to allocate the returned result's device memory - * @return Lists column of strings + * @param mr Device memory resource used to allocate the returned result's device memory. + * @return Lists column of strings. */ -std::unique_ptr rsplit_record_re( +[[deprecated]] std::unique_ptr rsplit_record_re( strings_column_view const& input, - regex_program const& prog, + std::string_view pattern, size_type maxsplit = -1, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); diff --git a/cpp/include/doxygen_groups.h b/cpp/include/doxygen_groups.h index b1d56f43057..628d48f64cd 100644 --- a/cpp/include/doxygen_groups.h +++ b/cpp/include/doxygen_groups.h @@ -128,6 +128,7 @@ * @defgroup strings_modify Modifying * @defgroup strings_replace Replacing * @defgroup strings_split Splitting + * @defgroup strings_extract Extracting * @defgroup strings_json JSON * @defgroup strings_regex Regex * @} diff --git a/cpp/tests/strings/contains_tests.cpp b/cpp/tests/strings/contains_tests.cpp index 8ddd8eedb51..5331c4c34d8 100644 --- a/cpp/tests/strings/contains_tests.cpp +++ b/cpp/tests/strings/contains_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -142,15 +142,13 @@ TEST_F(StringsContainsTests, ContainsTest) for (int idx = 0; idx < static_cast(patterns.size()); ++idx) { std::string ptn = patterns[idx]; - auto results = cudf::strings::contains_re(strings_view, ptn); bool* h_expected = h_expecteds.data() + (idx * h_strings.size()); cudf::test::fixed_width_column_wrapper expected( h_expected, h_expected + h_strings.size(), thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - auto prog = cudf::strings::regex_program::create(ptn); - results = cudf::strings::contains_re(strings_view, *prog); + auto prog = cudf::strings::regex_program::create(ptn); + auto results = cudf::strings::contains_re(strings_view, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } } @@ -167,54 +165,46 @@ TEST_F(StringsContainsTests, MatchesTest) auto strings_view = cudf::strings_column_view(strings); { auto const pattern = std::string("lazy"); - auto results = cudf::strings::matches_re(strings_view, pattern); bool h_expected[] = {false, false, true, false, false, false, false}; cudf::test::fixed_width_column_wrapper expected( h_expected, h_expected + h_strings.size(), thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - auto prog = cudf::strings::regex_program::create(pattern); - results = cudf::strings::matches_re(strings_view, *prog); + auto prog = cudf::strings::regex_program::create(pattern); + auto results = cudf::strings::matches_re(strings_view, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { auto const pattern = std::string("\\d+"); - auto results = cudf::strings::matches_re(strings_view, pattern); bool h_expected[] = {false, false, false, true, true, false, false}; cudf::test::fixed_width_column_wrapper expected( h_expected, h_expected + h_strings.size(), thrust::make_transform_iterator(h_strings.begin(), 
[](auto str) { return str != nullptr; })); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - auto prog = cudf::strings::regex_program::create(pattern); - results = cudf::strings::matches_re(strings_view, *prog); + auto prog = cudf::strings::regex_program::create(pattern); + auto results = cudf::strings::matches_re(strings_view, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { auto const pattern = std::string("@\\w+"); - auto results = cudf::strings::matches_re(strings_view, pattern); bool h_expected[] = {false, false, false, false, false, false, false}; cudf::test::fixed_width_column_wrapper expected( h_expected, h_expected + h_strings.size(), thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - auto prog = cudf::strings::regex_program::create(pattern); - results = cudf::strings::matches_re(strings_view, *prog); + auto prog = cudf::strings::regex_program::create(pattern); + auto results = cudf::strings::matches_re(strings_view, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { auto const pattern = std::string(".*"); - auto results = cudf::strings::matches_re(strings_view, pattern); bool h_expected[] = {true, true, true, true, true, false, true}; cudf::test::fixed_width_column_wrapper expected( h_expected, h_expected + h_strings.size(), thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - auto prog = cudf::strings::regex_program::create(pattern); - results = cudf::strings::matches_re(strings_view, *prog); + auto prog = cudf::strings::regex_program::create(pattern); + auto results = cudf::strings::matches_re(strings_view, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } } @@ -236,12 +226,10 @@ TEST_F(StringsContainsTests, MatchesIPV4Test) std::string pattern = 
"^(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)" "$"; - auto results = cudf::strings::matches_re(strings_view, pattern); cudf::test::fixed_width_column_wrapper expected( {true, true, false, false, false, false, true, true, true, true}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); - auto prog = cudf::strings::regex_program::create(pattern); - results = cudf::strings::matches_re(strings_view, *prog); + auto prog = cudf::strings::regex_program::create(pattern); + auto results = cudf::strings::matches_re(strings_view, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); } { // is_loopback: 72 instructions @@ -249,12 +237,10 @@ TEST_F(StringsContainsTests, MatchesIPV4Test) "^127\\.([0-9]|[1-9][0-9]|1([0-9][0-9])|2([0-4][0-9]|5[0-5]))" "\\.([0-9]|[1-9][0-9]|1([0-9][0-9])|2([0-4][0-9]|5[0-5]))" "\\.([0-9]|[1-9][0-9]|1([0-9][0-9])|2([0-4][0-9]|5[0-5]))$"; - auto results = cudf::strings::matches_re(strings_view, pattern); cudf::test::fixed_width_column_wrapper expected( {false, false, false, false, false, false, false, false, false, true}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); - auto prog = cudf::strings::regex_program::create(pattern); - results = cudf::strings::matches_re(strings_view, *prog); + auto prog = cudf::strings::regex_program::create(pattern); + auto results = cudf::strings::matches_re(strings_view, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); } { // is_multicast: 79 instructions @@ -262,12 +248,10 @@ TEST_F(StringsContainsTests, MatchesIPV4Test) "^(2(2[4-9]|3[0-9]))\\.([0-9]|[1-9][0-9]|1([0-9][0-9])|2([0-4][0-9]|5[0-5]))" "\\.([0-9]|[1-9][0-9]|1([0-9][0-9])|2([0-4][0-9]|5[0-5]))" "\\.([0-9]|[1-9][0-9]|1([0-9][0-9])|2([0-4][0-9]|5[0-5]))$"; - auto results = cudf::strings::matches_re(strings_view, pattern); cudf::test::fixed_width_column_wrapper expected( {false, false, false, false, false, false, true, true, false, false}); - 
CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); - auto prog = cudf::strings::regex_program::create(pattern); - results = cudf::strings::matches_re(strings_view, *prog); + auto prog = cudf::strings::regex_program::create(pattern); + auto results = cudf::strings::matches_re(strings_view, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); } } @@ -279,40 +263,30 @@ TEST_F(StringsContainsTests, OctalTest) auto expected = cudf::test::fixed_width_column_wrapper({1, 0, 1, 0, 0, 0}); auto pattern = std::string("\\101"); - auto results = cudf::strings::contains_re(strings_view, pattern); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - auto prog = cudf::strings::regex_program::create(pattern); - results = cudf::strings::contains_re(strings_view, *prog); + auto prog = cudf::strings::regex_program::create(pattern); + auto results = cudf::strings::contains_re(strings_view, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); pattern = std::string("\\1013"); - results = cudf::strings::contains_re(strings_view, pattern); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); prog = cudf::strings::regex_program::create(pattern); results = cudf::strings::contains_re(strings_view, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); pattern = std::string("D*\\101\\063"); - results = cudf::strings::contains_re(strings_view, pattern); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); prog = cudf::strings::regex_program::create(pattern); results = cudf::strings::contains_re(strings_view, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); pattern = std::string("\\719"); - results = cudf::strings::contains_re(strings_view, pattern); expected = cudf::test::fixed_width_column_wrapper({0, 0, 0, 0, 1, 0}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - prog = cudf::strings::regex_program::create(pattern); - results = cudf::strings::contains_re(strings_view, *prog); + prog = cudf::strings::regex_program::create(pattern); + 
results = cudf::strings::contains_re(strings_view, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); pattern = std::string("[\\7][\\11][\\15]"); - results = cudf::strings::contains_re(strings_view, pattern); expected = cudf::test::fixed_width_column_wrapper({0, 0, 0, 0, 0, 1}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - prog = cudf::strings::regex_program::create(pattern); - results = cudf::strings::contains_re(strings_view, *prog); + prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::contains_re(strings_view, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } @@ -334,20 +308,16 @@ TEST_F(StringsContainsTests, HexTest) str << "\\x" << std::setfill('0') << std::setw(2) << std::hex << static_cast(ch); std::string pattern = str.str(); - auto results = cudf::strings::contains_re(strings_view, pattern); // only one element in the input should match ch auto true_dat = cudf::detail::make_counting_transform_iterator( 0, [ch](auto idx) { return ch == static_cast(idx); }); cudf::test::fixed_width_column_wrapper expected(true_dat, true_dat + count); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - auto prog = cudf::strings::regex_program::create(pattern); - results = cudf::strings::contains_re(strings_view, *prog); + auto prog = cudf::strings::regex_program::create(pattern); + auto results = cudf::strings::contains_re(strings_view, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); // also test hex character appearing in character class brackets pattern = "[" + pattern + "]"; - results = cudf::strings::contains_re(strings_view, pattern); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); prog = cudf::strings::regex_program::create(pattern); results = cudf::strings::contains_re(strings_view, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); @@ -366,43 +336,33 @@ TEST_F(StringsContainsTests, EmbeddedNullCharacter) auto strings_view = cudf::strings_column_view(input); auto pattern = 
std::string("A"); - auto results = cudf::strings::contains_re(strings_view, pattern); auto expected = cudf::test::fixed_width_column_wrapper({1, 0, 0, 0, 0, 0, 0, 0, 0, 0}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - auto prog = cudf::strings::regex_program::create(pattern); - results = cudf::strings::contains_re(strings_view, *prog); + auto prog = cudf::strings::regex_program::create(pattern); + auto results = cudf::strings::contains_re(strings_view, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); pattern = std::string("B"); - results = cudf::strings::contains_re(strings_view, pattern); expected = cudf::test::fixed_width_column_wrapper({1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - prog = cudf::strings::regex_program::create(pattern); - results = cudf::strings::contains_re(strings_view, *prog); + prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::contains_re(strings_view, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); pattern = std::string("J\\0B"); - results = cudf::strings::contains_re(strings_view, pattern); expected = cudf::test::fixed_width_column_wrapper({0, 0, 0, 0, 0, 0, 0, 0, 0, 1}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - prog = cudf::strings::regex_program::create(pattern); - results = cudf::strings::contains_re(strings_view, *prog); + prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::contains_re(strings_view, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); pattern = std::string("[G-J][\\0]B"); - results = cudf::strings::contains_re(strings_view, pattern); expected = cudf::test::fixed_width_column_wrapper({0, 0, 0, 0, 0, 0, 1, 1, 1, 1}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - prog = cudf::strings::regex_program::create(pattern); - results = cudf::strings::contains_re(strings_view, *prog); + prog = cudf::strings::regex_program::create(pattern); + results = 
cudf::strings::contains_re(strings_view, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); pattern = std::string("[A-D][\\x00]B"); - results = cudf::strings::contains_re(strings_view, pattern); expected = cudf::test::fixed_width_column_wrapper({1, 1, 1, 1, 0, 0, 0, 0, 0, 0}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - prog = cudf::strings::regex_program::create(pattern); - results = cudf::strings::contains_re(strings_view, *prog); + prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::contains_re(strings_view, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } @@ -427,32 +387,26 @@ TEST_F(StringsContainsTests, CountTest) auto strings_view = cudf::strings_column_view(strings); { auto pattern = std::string("[tT]he"); - auto results = cudf::strings::count_re(strings_view, pattern); cudf::test::fixed_width_column_wrapper expected( {2, 0, 0, 0, 0, 0}, cudf::test::iterators::nulls_from_nullptrs(h_strings)); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - auto prog = cudf::strings::regex_program::create(pattern); - results = cudf::strings::count_re(strings_view, *prog); + auto prog = cudf::strings::regex_program::create(pattern); + auto results = cudf::strings::count_re(strings_view, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { auto pattern = std::string("@\\w+"); - auto results = cudf::strings::count_re(strings_view, pattern); cudf::test::fixed_width_column_wrapper expected( {1, 1, 0, 0, 0, 0}, cudf::test::iterators::nulls_from_nullptrs(h_strings)); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - auto prog = cudf::strings::regex_program::create(pattern); - results = cudf::strings::count_re(strings_view, *prog); + auto prog = cudf::strings::regex_program::create(pattern); + auto results = cudf::strings::count_re(strings_view, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { auto pattern = std::string("\\d+:\\d+"); - auto results = 
cudf::strings::count_re(strings_view, pattern); cudf::test::fixed_width_column_wrapper expected( {0, 0, 2, 1, 0, 0}, cudf::test::iterators::nulls_from_nullptrs(h_strings)); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - auto prog = cudf::strings::regex_program::create(pattern); - results = cudf::strings::count_re(strings_view, *prog); + auto prog = cudf::strings::regex_program::create(pattern); + auto results = cudf::strings::count_re(strings_view, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } } @@ -465,71 +419,57 @@ TEST_F(StringsContainsTests, FixedQuantifier) { // exact match auto pattern = std::string("a{3}"); - auto results = cudf::strings::count_re(sv, pattern); cudf::test::fixed_width_column_wrapper expected({0, 0, 1, 1, 1, 2}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - auto prog = cudf::strings::regex_program::create(pattern); - results = cudf::strings::count_re(sv, *prog); + auto prog = cudf::strings::regex_program::create(pattern); + auto results = cudf::strings::count_re(sv, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { // range match (greedy quantifier) auto pattern = std::string("a{3,5}"); - auto results = cudf::strings::count_re(sv, pattern); cudf::test::fixed_width_column_wrapper expected({0, 0, 1, 1, 1, 1}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - auto prog = cudf::strings::regex_program::create(pattern); - results = cudf::strings::count_re(sv, *prog); + auto prog = cudf::strings::regex_program::create(pattern); + auto results = cudf::strings::count_re(sv, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { // minimum match (greedy quantifier) auto pattern = std::string("a{2,}"); - auto results = cudf::strings::count_re(sv, pattern); cudf::test::fixed_width_column_wrapper expected({0, 1, 1, 1, 1, 1}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - auto prog = cudf::strings::regex_program::create(pattern); - results = cudf::strings::count_re(sv, *prog); + 
auto prog = cudf::strings::regex_program::create(pattern); + auto results = cudf::strings::count_re(sv, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { // range match (lazy quantifier) auto pattern = std::string("a{2,4}?"); - auto results = cudf::strings::count_re(sv, pattern); cudf::test::fixed_width_column_wrapper expected({0, 1, 1, 2, 2, 3}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - auto prog = cudf::strings::regex_program::create(pattern); - results = cudf::strings::count_re(sv, *prog); + auto prog = cudf::strings::regex_program::create(pattern); + auto results = cudf::strings::count_re(sv, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { // minimum match (lazy quantifier) auto pattern = std::string("a{1,}?"); - auto results = cudf::strings::count_re(sv, pattern); cudf::test::fixed_width_column_wrapper expected({1, 2, 3, 4, 5, 6}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - auto prog = cudf::strings::regex_program::create(pattern); - results = cudf::strings::count_re(sv, *prog); + auto prog = cudf::strings::regex_program::create(pattern); + auto results = cudf::strings::count_re(sv, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { // zero match auto pattern = std::string("aaaa{0}"); - auto results = cudf::strings::count_re(sv, pattern); cudf::test::fixed_width_column_wrapper expected({0, 0, 1, 1, 1, 2}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - auto prog = cudf::strings::regex_program::create(pattern); - results = cudf::strings::count_re(sv, *prog); + auto prog = cudf::strings::regex_program::create(pattern); + auto results = cudf::strings::count_re(sv, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { // poorly formed auto pattern = std::string("aaaa{n,m}"); - auto results = cudf::strings::count_re(sv, pattern); cudf::test::fixed_width_column_wrapper expected({0, 0, 0, 0, 0, 0}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - auto prog = 
cudf::strings::regex_program::create(pattern); - results = cudf::strings::count_re(sv, *prog); + auto prog = cudf::strings::regex_program::create(pattern); + auto results = cudf::strings::count_re(sv, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } } @@ -557,20 +497,16 @@ TEST_F(StringsContainsTests, OverlappedClasses) { auto pattern = std::string("[e-gb-da-c]"); - auto results = cudf::strings::count_re(sv, pattern); cudf::test::fixed_width_column_wrapper expected({7, 4, 0, 0, 1}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - auto prog = cudf::strings::regex_program::create(pattern); - results = cudf::strings::count_re(sv, *prog); + auto prog = cudf::strings::regex_program::create(pattern); + auto results = cudf::strings::count_re(sv, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { auto pattern = std::string("[á-éê-ú]"); - auto results = cudf::strings::count_re(sv, pattern); cudf::test::fixed_width_column_wrapper expected({0, 1, 0, 6, 0}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - auto prog = cudf::strings::regex_program::create(pattern); - results = cudf::strings::count_re(sv, *prog); + auto prog = cudf::strings::regex_program::create(pattern); + auto results = cudf::strings::count_re(sv, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } } @@ -582,20 +518,16 @@ TEST_F(StringsContainsTests, NegatedClasses) { auto pattern = std::string("[^a-f]"); - auto results = cudf::strings::count_re(sv, pattern); cudf::test::fixed_width_column_wrapper expected({1, 4, 0, 5, 3}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - auto prog = cudf::strings::regex_program::create(pattern); - results = cudf::strings::count_re(sv, *prog); + auto prog = cudf::strings::regex_program::create(pattern); + auto results = cudf::strings::count_re(sv, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { auto pattern = std::string("[^a-eá-é]"); - auto results = cudf::strings::count_re(sv, pattern); 
cudf::test::fixed_width_column_wrapper expected({2, 5, 0, 1, 3}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - auto prog = cudf::strings::regex_program::create(pattern); - results = cudf::strings::count_re(sv, *prog); + auto prog = cudf::strings::regex_program::create(pattern); + auto results = cudf::strings::count_re(sv, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } } @@ -607,31 +539,35 @@ TEST_F(StringsContainsTests, IncompleteClassesRange) { cudf::test::fixed_width_column_wrapper expected({1, 0, 0, 1, 1}); - auto results = cudf::strings::contains_re(sv, "[a-z]"); + auto prog = cudf::strings::regex_program::create("[a-z]"); + auto results = cudf::strings::contains_re(sv, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - auto prog = cudf::strings::regex_program::create("[a-m-z]"); // same as [a-z] - results = cudf::strings::contains_re(sv, *prog); + prog = cudf::strings::regex_program::create("[a-m-z]"); // same as [a-z] + results = cudf::strings::contains_re(sv, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { cudf::test::fixed_width_column_wrapper expected({1, 1, 0, 1, 1}); - auto results = cudf::strings::contains_re(sv, "[g-]"); + auto prog = cudf::strings::regex_program::create("[g-]"); + auto results = cudf::strings::contains_re(sv, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - auto prog = cudf::strings::regex_program::create("[-k]"); - results = cudf::strings::contains_re(sv, *prog); + prog = cudf::strings::regex_program::create("[-k]"); + results = cudf::strings::contains_re(sv, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { cudf::test::fixed_width_column_wrapper expected({1, 1, 0, 0, 1}); - auto results = cudf::strings::contains_re(sv, "[-]"); + auto prog = cudf::strings::regex_program::create("[-]"); + auto results = cudf::strings::contains_re(sv, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - results = cudf::strings::contains_re(sv, "[+--]"); + prog = 
cudf::strings::regex_program::create("[+--]"); + results = cudf::strings::contains_re(sv, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - auto prog = cudf::strings::regex_program::create("[a-c-]"); - results = cudf::strings::contains_re(sv, *prog); + prog = cudf::strings::regex_program::create("[a-c-]"); + results = cudf::strings::contains_re(sv, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); prog = cudf::strings::regex_program::create("[-d-f]"); results = cudf::strings::contains_re(sv, *prog); @@ -650,37 +586,25 @@ TEST_F(StringsContainsTests, MultiLine) auto prog_ml = cudf::strings::regex_program::create(pattern, cudf::strings::regex_flags::MULTILINE); - auto results = cudf::strings::contains_re(view, pattern, cudf::strings::regex_flags::MULTILINE); auto expected_contains = cudf::test::fixed_width_column_wrapper({1, 1, 1, 0, 1, 1}); + auto results = cudf::strings::contains_re(view, *prog_ml); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_contains); - results = cudf::strings::contains_re(view, *prog_ml); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_contains); - results = cudf::strings::contains_re(view, pattern); expected_contains = cudf::test::fixed_width_column_wrapper({0, 0, 1, 0, 1, 0}); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_contains); - results = cudf::strings::contains_re(view, *prog); + results = cudf::strings::contains_re(view, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_contains); - results = cudf::strings::matches_re(view, pattern, cudf::strings::regex_flags::MULTILINE); auto expected_matches = cudf::test::fixed_width_column_wrapper({1, 0, 1, 0, 1, 0}); + results = cudf::strings::matches_re(view, *prog_ml); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_matches); - results = cudf::strings::matches_re(view, *prog_ml); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_matches); - results = cudf::strings::matches_re(view, pattern); expected_matches = 
cudf::test::fixed_width_column_wrapper({0, 0, 1, 0, 1, 0}); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_matches); - results = cudf::strings::matches_re(view, *prog); + results = cudf::strings::matches_re(view, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_matches); - results = cudf::strings::count_re(view, pattern, cudf::strings::regex_flags::MULTILINE); auto expected_count = cudf::test::fixed_width_column_wrapper({2, 1, 1, 0, 1, 1}); + results = cudf::strings::count_re(view, *prog_ml); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_count); - results = cudf::strings::count_re(view, *prog_ml); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_count); - results = cudf::strings::count_re(view, pattern); expected_count = cudf::test::fixed_width_column_wrapper({0, 0, 1, 0, 1, 0}); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_count); - results = cudf::strings::count_re(view, *prog); + results = cudf::strings::count_re(view, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_count); } @@ -723,50 +647,36 @@ TEST_F(StringsContainsTests, DotAll) auto prog_dotall = cudf::strings::regex_program::create(pattern, cudf::strings::regex_flags::DOTALL); - auto results = cudf::strings::contains_re(view, pattern, cudf::strings::regex_flags::DOTALL); auto expected_contains = cudf::test::fixed_width_column_wrapper({1, 1, 1, 0}); + auto results = cudf::strings::contains_re(view, *prog_dotall); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_contains); - results = cudf::strings::contains_re(view, *prog_dotall); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_contains); - results = cudf::strings::contains_re(view, pattern); expected_contains = cudf::test::fixed_width_column_wrapper({0, 0, 1, 0}); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_contains); - results = cudf::strings::contains_re(view, *prog); + results = cudf::strings::contains_re(view, *prog); 
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_contains); - results = cudf::strings::matches_re(view, pattern, cudf::strings::regex_flags::DOTALL); auto expected_matches = cudf::test::fixed_width_column_wrapper({1, 0, 1, 0}); + results = cudf::strings::matches_re(view, *prog_dotall); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_matches); - results = cudf::strings::matches_re(view, *prog_dotall); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_matches); - results = cudf::strings::matches_re(view, pattern); expected_matches = cudf::test::fixed_width_column_wrapper({0, 0, 1, 0}); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_matches); - results = cudf::strings::matches_re(view, *prog); + results = cudf::strings::matches_re(view, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_matches); pattern = std::string("a.*?f"); prog = cudf::strings::regex_program::create(pattern); prog_dotall = cudf::strings::regex_program::create(pattern, cudf::strings::regex_flags::DOTALL); - results = cudf::strings::count_re(view, pattern, cudf::strings::regex_flags::DOTALL); auto expected_count = cudf::test::fixed_width_column_wrapper({2, 1, 1, 0}); + results = cudf::strings::count_re(view, *prog_dotall); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_count); - results = cudf::strings::count_re(view, *prog_dotall); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_count); - results = cudf::strings::count_re(view, pattern); expected_count = cudf::test::fixed_width_column_wrapper({0, 0, 1, 0}); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_count); - results = cudf::strings::count_re(view, *prog); + results = cudf::strings::count_re(view, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_count); auto both_flags = static_cast(cudf::strings::regex_flags::DOTALL | cudf::strings::regex_flags::MULTILINE); - results = cudf::strings::count_re(view, pattern, both_flags); expected_count = 
cudf::test::fixed_width_column_wrapper({2, 1, 1, 0}); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_count); - auto prog_both = cudf::strings::regex_program::create(pattern, both_flags); - results = cudf::strings::count_re(view, *prog_both); + auto prog_both = cudf::strings::regex_program::create(pattern, both_flags); + results = cudf::strings::count_re(view, *prog_both); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_count); } @@ -782,18 +692,14 @@ TEST_F(StringsContainsTests, ASCII) "\\w+\\s+\\d+"}; for (auto ptn : patterns) { - auto results = cudf::strings::contains_re(view, ptn, cudf::strings::regex_flags::ASCII); auto expected_contains = cudf::test::fixed_width_column_wrapper({1, 0, 0, 0}); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_contains); - auto prog = cudf::strings::regex_program::create(ptn, cudf::strings::regex_flags::ASCII); - results = cudf::strings::contains_re(view, *prog); + auto prog = cudf::strings::regex_program::create(ptn, cudf::strings::regex_flags::ASCII); + auto results = cudf::strings::contains_re(view, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_contains); - results = cudf::strings::contains_re(view, ptn); expected_contains = cudf::test::fixed_width_column_wrapper({1, 1, 1, 1}); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_contains); - prog = cudf::strings::regex_program::create(ptn); - results = cudf::strings::contains_re(view, *prog); + prog = cudf::strings::regex_program::create(ptn); + results = cudf::strings::contains_re(view, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_contains); } } @@ -804,6 +710,7 @@ TEST_F(StringsContainsTests, MediumRegex) std::string medium_regex = "hello @abc @def world The quick brown @fox jumps over the lazy @dog hello " "http://www.world.com"; + auto prog = cudf::strings::regex_program::create(medium_regex); std::vector h_strings{ "hello @abc @def world The quick brown @fox jumps over the lazy @dog hello " @@ -816,21 +723,21 
@@ TEST_F(StringsContainsTests, MediumRegex) auto strings_view = cudf::strings_column_view(strings); { - auto results = cudf::strings::contains_re(strings_view, medium_regex); + auto results = cudf::strings::contains_re(strings_view, *prog); bool h_expected[] = {true, false, false}; cudf::test::fixed_width_column_wrapper expected(h_expected, h_expected + h_strings.size()); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } { - auto results = cudf::strings::matches_re(strings_view, medium_regex); + auto results = cudf::strings::matches_re(strings_view, *prog); bool h_expected[] = {true, false, false}; cudf::test::fixed_width_column_wrapper expected(h_expected, h_expected + h_strings.size()); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } { - auto results = cudf::strings::count_re(strings_view, medium_regex); + auto results = cudf::strings::count_re(strings_view, *prog); int32_t h_expected[] = {1, 0, 0}; cudf::test::fixed_width_column_wrapper expected(h_expected, h_expected + h_strings.size()); @@ -844,6 +751,7 @@ TEST_F(StringsContainsTests, LargeRegex) std::string large_regex = "hello @abc @def world The quick brown @fox jumps over the lazy @dog hello " "http://www.world.com I'm here @home zzzz"; + auto prog = cudf::strings::regex_program::create(large_regex); std::vector h_strings{ "hello @abc @def world The quick brown @fox jumps over the lazy @dog hello " @@ -856,21 +764,21 @@ TEST_F(StringsContainsTests, LargeRegex) auto strings_view = cudf::strings_column_view(strings); { - auto results = cudf::strings::contains_re(strings_view, large_regex); + auto results = cudf::strings::contains_re(strings_view, *prog); bool h_expected[] = {true, false, false}; cudf::test::fixed_width_column_wrapper expected(h_expected, h_expected + h_strings.size()); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } { - auto results = cudf::strings::matches_re(strings_view, large_regex); + auto results = cudf::strings::matches_re(strings_view, *prog); bool 
h_expected[] = {true, false, false}; cudf::test::fixed_width_column_wrapper expected(h_expected, h_expected + h_strings.size()); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } { - auto results = cudf::strings::count_re(strings_view, large_regex); + auto results = cudf::strings::count_re(strings_view, *prog); int32_t h_expected[] = {1, 0, 0}; cudf::test::fixed_width_column_wrapper expected(h_expected, h_expected + h_strings.size()); @@ -883,21 +791,21 @@ TEST_F(StringsContainsTests, ExtraLargeRegex) // This results in 321 regex instructions which is above the 'large' range. std::string data(320, '0'); cudf::test::strings_column_wrapper strings({data, data, data, data, data, "00"}); - std::string pattern = data; + auto prog = cudf::strings::regex_program::create(data); auto strings_view = cudf::strings_column_view(strings); { - auto results = cudf::strings::contains_re(strings_view, pattern); + auto results = cudf::strings::contains_re(strings_view, *prog); cudf::test::fixed_width_column_wrapper expected({true, true, true, true, true, false}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { - auto results = cudf::strings::matches_re(strings_view, pattern); + auto results = cudf::strings::matches_re(strings_view, *prog); cudf::test::fixed_width_column_wrapper expected({true, true, true, true, true, false}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { - auto results = cudf::strings::count_re(strings_view, pattern); + auto results = cudf::strings::count_re(strings_view, *prog); cudf::test::fixed_width_column_wrapper expected({1, 1, 1, 1, 1, 0}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } diff --git a/cpp/tests/strings/extract_tests.cpp b/cpp/tests/strings/extract_tests.cpp index 07d1b99da5a..1ca218a5522 100644 --- a/cpp/tests/strings/extract_tests.cpp +++ b/cpp/tests/strings/extract_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -61,7 +61,6 @@ TEST_F(StringsExtractTests, ExtractTest) nullptr}; std::string pattern = "(\\w+) (\\w+)"; - auto results = cudf::strings::extract(strings_view, pattern); cudf::test::strings_column_wrapper expected1( h_expecteds.data(), @@ -76,10 +75,9 @@ TEST_F(StringsExtractTests, ExtractTest) columns.push_back(expected1.release()); columns.push_back(expected2.release()); cudf::table expected(std::move(columns)); - CUDF_TEST_EXPECT_TABLES_EQUAL(*results, expected); - auto prog = cudf::strings::regex_program::create(pattern); - results = cudf::strings::extract(strings_view, pattern); + auto prog = cudf::strings::regex_program::create(pattern); + auto results = cudf::strings::extract(strings_view, *prog); CUDF_TEST_EXPECT_TABLES_EQUAL(*results, expected); } @@ -102,7 +100,6 @@ TEST_F(StringsExtractTests, ExtractDomainTest) auto strings_view = cudf::strings_column_view(strings); std::string pattern = "([\\w]+[\\.].*[^/]|[\\-\\w]+[\\.].*[^/])"; - auto results = cudf::strings::extract(strings_view, pattern); cudf::test::strings_column_wrapper expected1({ "www.google.com", @@ -121,10 +118,9 @@ TEST_F(StringsExtractTests, ExtractDomainTest) "a23-44-13-2.deploy.static.akamaitechnologies.com", }); cudf::table_view expected{{expected1}}; - CUDF_TEST_EXPECT_TABLES_EQUAL(*results, expected); - auto prog = cudf::strings::regex_program::create(pattern); - results = cudf::strings::extract(strings_view, *prog); + auto prog = cudf::strings::regex_program::create(pattern); + auto results = cudf::strings::extract(strings_view, *prog); CUDF_TEST_EXPECT_TABLES_EQUAL(*results, expected); } @@ -154,11 +150,9 @@ TEST_F(StringsExtractTests, ExtractEventTest) for (std::size_t idx = 0; idx < patterns.size(); ++idx) { auto pattern = patterns[idx]; - auto results = cudf::strings::extract(strings_view, pattern); cudf::test::strings_column_wrapper 
expected({expecteds[idx]}); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view().column(0), expected); - auto prog = cudf::strings::regex_program::create(pattern); - results = cudf::strings::extract(strings_view, *prog); + auto prog = cudf::strings::regex_program::create(pattern); + auto results = cudf::strings::extract(strings_view, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view().column(0), expected); } } @@ -170,23 +164,19 @@ TEST_F(StringsExtractTests, MultiLine) auto view = cudf::strings_column_view(input); auto pattern = std::string("(^[a-c]+$)"); - auto results = cudf::strings::extract(view, pattern, cudf::strings::regex_flags::MULTILINE); cudf::test::strings_column_wrapper expected_multiline({"abc", "abc", "abc", "", "abc", "abc"}, {1, 1, 1, 0, 1, 1}); auto expected = cudf::table_view{{expected_multiline}}; - CUDF_TEST_EXPECT_TABLES_EQUAL(*results, expected); auto prog = cudf::strings::regex_program::create(pattern, cudf::strings::regex_flags::MULTILINE); - results = cudf::strings::extract(view, *prog); + auto results = cudf::strings::extract(view, *prog); CUDF_TEST_EXPECT_TABLES_EQUAL(*results, expected); pattern = std::string("^([a-c]+)$"); - results = cudf::strings::extract(view, pattern); cudf::test::strings_column_wrapper expected_default({"", "", "abc", "", "abc", ""}, {0, 0, 1, 0, 1, 0}); expected = cudf::table_view{{expected_default}}; - CUDF_TEST_EXPECT_TABLES_EQUAL(*results, expected); - prog = cudf::strings::regex_program::create(pattern); - results = cudf::strings::extract(view, *prog); + prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::extract(view, *prog); CUDF_TEST_EXPECT_TABLES_EQUAL(*results, expected); } @@ -196,21 +186,17 @@ TEST_F(StringsExtractTests, DotAll) auto view = cudf::strings_column_view(input); auto pattern = std::string("(a.*f)"); - auto results = cudf::strings::extract(view, pattern, cudf::strings::regex_flags::DOTALL); cudf::test::strings_column_wrapper 
expected_dotall({"abc\nfa\nef", "abbc\nfff", "abcdef", ""}, {1, 1, 1, 0}); auto expected = cudf::table_view{{expected_dotall}}; - CUDF_TEST_EXPECT_TABLES_EQUAL(*results, expected); - auto prog = cudf::strings::regex_program::create(pattern, cudf::strings::regex_flags::DOTALL); - results = cudf::strings::extract(view, *prog); + auto prog = cudf::strings::regex_program::create(pattern, cudf::strings::regex_flags::DOTALL); + auto results = cudf::strings::extract(view, *prog); CUDF_TEST_EXPECT_TABLES_EQUAL(*results, expected); - results = cudf::strings::extract(view, pattern); cudf::test::strings_column_wrapper expected_default({"", "", "abcdef", ""}, {0, 0, 1, 0}); expected = cudf::table_view{{expected_default}}; - CUDF_TEST_EXPECT_TABLES_EQUAL(*results, expected); - prog = cudf::strings::regex_program::create(pattern); - results = cudf::strings::extract(view, *prog); + prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::extract(view, *prog); CUDF_TEST_EXPECT_TABLES_EQUAL(*results, expected); } @@ -224,7 +210,6 @@ TEST_F(StringsExtractTests, EmptyExtractTest) auto strings_view = cudf::strings_column_view(strings); auto pattern = std::string("([^_]*)\\Z"); - auto results = cudf::strings::extract(strings_view, pattern); std::vector h_expected{nullptr, "AAA", "A", "", "", ""}; cudf::test::strings_column_wrapper expected( @@ -234,9 +219,8 @@ TEST_F(StringsExtractTests, EmptyExtractTest) std::vector> columns; columns.push_back(expected.release()); cudf::table table_expected(std::move(columns)); - CUDF_TEST_EXPECT_TABLES_EQUAL(*results, table_expected); - auto prog = cudf::strings::regex_program::create(pattern); - results = cudf::strings::extract(strings_view, *prog); + auto prog = cudf::strings::regex_program::create(pattern); + auto results = cudf::strings::extract(strings_view, *prog); CUDF_TEST_EXPECT_TABLES_EQUAL(*results, table_expected); } @@ -250,7 +234,6 @@ TEST_F(StringsExtractTests, ExtractAllTest) auto sv = 
cudf::strings_column_view(input); auto pattern = std::string("(\\d+) (\\w+)"); - auto results = cudf::strings::extract_all_record(sv, pattern); bool valids[] = {true, true, true, false, false, false, true}; using LCW = cudf::test::lists_column_wrapper; @@ -262,9 +245,8 @@ TEST_F(StringsExtractTests, ExtractAllTest) LCW{}, LCW{"4", "pare"}}, valids); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); - auto prog = cudf::strings::regex_program::create(pattern); - results = cudf::strings::extract_all_record(sv, *prog); + auto prog = cudf::strings::regex_program::create(pattern); + auto results = cudf::strings::extract_all_record(sv, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); } @@ -276,9 +258,7 @@ TEST_F(StringsExtractTests, Errors) auto pattern = std::string("\\w+"); auto prog = cudf::strings::regex_program::create(pattern); - EXPECT_THROW(cudf::strings::extract(sv, pattern), cudf::logic_error); EXPECT_THROW(cudf::strings::extract(sv, *prog), cudf::logic_error); - EXPECT_THROW(cudf::strings::extract_all_record(sv, pattern), cudf::logic_error); EXPECT_THROW(cudf::strings::extract_all_record(sv, *prog), cudf::logic_error); } @@ -288,6 +268,7 @@ TEST_F(StringsExtractTests, MediumRegex) std::string medium_regex = "hello @abc @def (world) The quick brown @fox jumps over the lazy @dog hello " "http://www.world.com"; + auto prog = cudf::strings::regex_program::create(medium_regex); std::vector h_strings{ "hello @abc @def world The quick brown @fox jumps over the lazy @dog hello " @@ -302,7 +283,7 @@ TEST_F(StringsExtractTests, MediumRegex) thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); auto strings_view = cudf::strings_column_view(strings); - auto results = cudf::strings::extract(strings_view, medium_regex); + auto results = cudf::strings::extract(strings_view, *prog); std::vector h_expected{"world", nullptr, nullptr}; cudf::test::strings_column_wrapper expected( h_expected.begin(), @@ 
-317,6 +298,7 @@ TEST_F(StringsExtractTests, LargeRegex) std::string large_regex = "hello @abc @def world The (quick) brown @fox jumps over the lazy @dog hello " "http://www.world.com I'm here @home zzzz"; + auto prog = cudf::strings::regex_program::create(large_regex); std::vector h_strings{ "hello @abc @def world The quick brown @fox jumps over the lazy @dog hello " @@ -331,7 +313,7 @@ TEST_F(StringsExtractTests, LargeRegex) thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); auto strings_view = cudf::strings_column_view(strings); - auto results = cudf::strings::extract(strings_view, large_regex); + auto results = cudf::strings::extract(strings_view, *prog); std::vector h_expected{"quick", nullptr, nullptr}; cudf::test::strings_column_wrapper expected( h_expected.begin(), diff --git a/cpp/tests/strings/findall_tests.cpp b/cpp/tests/strings/findall_tests.cpp index 6428be28e0a..c105f4ace6f 100644 --- a/cpp/tests/strings/findall_tests.cpp +++ b/cpp/tests/strings/findall_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -39,7 +39,6 @@ TEST_F(StringsFindallTests, FindallTest) auto sv = cudf::strings_column_view(input); auto pattern = std::string("(\\d+)-(\\w+)"); - auto results = cudf::strings::findall(sv, pattern); using LCW = cudf::test::lists_column_wrapper; LCW expected({LCW{"3-A"}, @@ -51,9 +50,8 @@ TEST_F(StringsFindallTests, FindallTest) LCW{}, LCW{"25-9000"}}, valids); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); - auto prog = cudf::strings::regex_program::create(pattern); - results = cudf::strings::findall(sv, *prog); + auto prog = cudf::strings::regex_program::create(pattern); + auto results = cudf::strings::findall(sv, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); } @@ -63,12 +61,10 @@ TEST_F(StringsFindallTests, Multiline) auto view = cudf::strings_column_view(input); auto pattern = std::string("(^abc$)"); - auto results = cudf::strings::findall(view, pattern, cudf::strings::regex_flags::MULTILINE); using LCW = cudf::test::lists_column_wrapper; LCW expected({LCW{"abc", "abc"}, LCW{"abc"}, LCW{"abc"}, LCW{}, LCW{"abc"}}); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); auto prog = cudf::strings::regex_program::create(pattern, cudf::strings::regex_flags::MULTILINE); - results = cudf::strings::findall(view, *prog); + auto results = cudf::strings::findall(view, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); } @@ -78,12 +74,10 @@ TEST_F(StringsFindallTests, DotAll) auto view = cudf::strings_column_view(input); auto pattern = std::string("(b.*f)"); - auto results = cudf::strings::findall(view, pattern, cudf::strings::regex_flags::DOTALL); using LCW = cudf::test::lists_column_wrapper; LCW expected({LCW{"bc\nfa\nef"}, LCW{"bbc\nfff"}, LCW{"bcdef"}, LCW{}}); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); - auto prog = cudf::strings::regex_program::create(pattern, cudf::strings::regex_flags::DOTALL); - results = cudf::strings::findall(view, *prog); + auto prog = 
cudf::strings::regex_program::create(pattern, cudf::strings::regex_flags::DOTALL); + auto results = cudf::strings::findall(view, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); } @@ -91,10 +85,11 @@ TEST_F(StringsFindallTests, MediumRegex) { // This results in 15 regex instructions and falls in the 'medium' range. std::string medium_regex = "(\\w+) (\\w+) (\\d+)"; + auto prog = cudf::strings::regex_program::create(medium_regex); cudf::test::strings_column_wrapper input({"first words 1234 and just numbers 9876", "neither"}); auto strings_view = cudf::strings_column_view(input); - auto results = cudf::strings::findall(strings_view, medium_regex); + auto results = cudf::strings::findall(strings_view, *prog); using LCW = cudf::test::lists_column_wrapper; LCW expected({LCW{"first words 1234", "just numbers 9876"}, LCW{}}); @@ -107,6 +102,7 @@ TEST_F(StringsFindallTests, LargeRegex) std::string large_regex = "hello @abc @def world The quick brown @fox jumps over the lazy @dog hello " "http://www.world.com I'm here @home zzzz"; + auto prog = cudf::strings::regex_program::create(large_regex); cudf::test::strings_column_wrapper input( {"hello @abc @def world The quick brown @fox jumps over the lazy @dog hello " @@ -119,7 +115,7 @@ TEST_F(StringsFindallTests, LargeRegex) "qrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz"}); auto strings_view = cudf::strings_column_view(input); - auto results = cudf::strings::findall(strings_view, large_regex); + auto results = cudf::strings::findall(strings_view, *prog); using LCW = cudf::test::lists_column_wrapper; LCW expected({LCW{large_regex.c_str()}, LCW{}, LCW{}}); diff --git a/cpp/tests/strings/replace_regex_tests.cpp b/cpp/tests/strings/replace_regex_tests.cpp index 840d998e56c..d7d0576d0eb 100644 --- a/cpp/tests/strings/replace_regex_tests.cpp +++ b/cpp/tests/strings/replace_regex_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. 
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -54,12 +54,10 @@ TEST_F(StringsReplaceRegexTest, ReplaceRegexTest) auto pattern = std::string("(\\bthe\\b)"); auto repl = cudf::string_scalar("="); - auto results = cudf::strings::replace_re(strings_view, pattern, repl); cudf::test::strings_column_wrapper expected( h_expected.begin(), h_expected.end(), cudf::test::iterators::nulls_from_nullptrs(h_expected)); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); - auto prog = cudf::strings::regex_program::create(pattern); - results = cudf::strings::replace_re(strings_view, *prog, repl); + auto prog = cudf::strings::regex_program::create(pattern); + auto results = cudf::strings::replace_re(strings_view, *prog, repl); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } @@ -120,8 +118,6 @@ TEST_F(StringsReplaceRegexTest, WithEmptyPattern) auto repls_view = cudf::strings_column_view(repls); auto results = cudf::strings::replace_re(strings_view, patterns, repls_view); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, strings); - results = cudf::strings::replace_re(strings_view, "", repl); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, strings); auto prog = cudf::strings::regex_program::create(empty_pattern); results = cudf::strings::replace_re(strings_view, *prog, repl); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, strings); @@ -134,15 +130,11 @@ TEST_F(StringsReplaceRegexTest, MultiReplacement) auto pattern = std::string("aba"); auto repl = cudf::string_scalar("_"); - auto results = cudf::strings::replace_re(sv, pattern, repl, 2); cudf::test::strings_column_wrapper expected({"_ bcd _", "_b_ abababa"}); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); - auto prog = cudf::strings::regex_program::create(pattern); - results = cudf::strings::replace_re(sv, *prog, repl, 2); + auto prog = 
cudf::strings::regex_program::create(pattern); + auto results = cudf::strings::replace_re(sv, *prog, repl, 2); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); - results = cudf::strings::replace_re(sv, pattern, repl, 0); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, input); results = cudf::strings::replace_re(sv, *prog, repl, 0); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, input); } @@ -154,19 +146,15 @@ TEST_F(StringsReplaceRegexTest, WordBoundary) auto pattern = std::string("\\b"); auto repl = cudf::string_scalar("X"); - auto results = cudf::strings::replace_re(sv, pattern, repl); auto expected = cudf::test::strings_column_wrapper( {"XabaX XbcdX\nXabaX", "XzézX", "XA1B2X-Xé3X", "XeX XéX", "X_X", "Xa_bX"}); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); - auto prog = cudf::strings::regex_program::create(pattern); - results = cudf::strings::replace_re(sv, *prog, repl); + auto prog = cudf::strings::regex_program::create(pattern); + auto results = cudf::strings::replace_re(sv, *prog, repl); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); pattern = std::string("\\B"); - results = cudf::strings::replace_re(sv, pattern, repl); expected = cudf::test::strings_column_wrapper( {"aXbXa bXcXd\naXbXa", "zXéXz", "AX1XBX2-éX3", "e é", "_", "aX_Xb"}); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); prog = cudf::strings::regex_program::create(pattern); results = cudf::strings::replace_re(sv, *prog, repl); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); @@ -180,17 +168,13 @@ TEST_F(StringsReplaceRegexTest, Alternation) auto pattern = std::string("(^|\\s)\\d+(\\s|$)"); auto repl = cudf::string_scalar("_"); - auto results = cudf::strings::replace_re(sv, pattern, repl); auto expected = cudf::test::strings_column_wrapper({"__ brr __ hello _", "_ABC_2022", "abé123 _ 89xyz"}); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); - auto prog = cudf::strings::regex_program::create(pattern); - results = cudf::strings::replace_re(sv, 
*prog, repl); + auto prog = cudf::strings::regex_program::create(pattern); + auto results = cudf::strings::replace_re(sv, *prog, repl); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); pattern = std::string("(\\s|^)\\d+($|\\s)"); - results = cudf::strings::replace_re(sv, pattern, repl); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); prog = cudf::strings::regex_program::create(pattern); results = cudf::strings::replace_re(sv, *prog, repl); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); @@ -203,19 +187,15 @@ TEST_F(StringsReplaceRegexTest, ZeroLengthMatch) auto pattern = std::string("D*"); auto repl = cudf::string_scalar("_"); - auto results = cudf::strings::replace_re(sv, pattern, repl); auto expected = cudf::test::strings_column_wrapper({"__", "_z_é_z_", "__s__s_", "_"}); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); - auto prog = cudf::strings::regex_program::create(pattern); - results = cudf::strings::replace_re(sv, *prog, repl); + auto prog = cudf::strings::regex_program::create(pattern); + auto results = cudf::strings::replace_re(sv, *prog, repl); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); pattern = std::string("D?s?"); - results = cudf::strings::replace_re(sv, pattern, repl); expected = cudf::test::strings_column_wrapper({"___", "_z_é_z_", "___", "_"}); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); - prog = cudf::strings::regex_program::create(pattern); - results = cudf::strings::replace_re(sv, *prog, repl); + prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::replace_re(sv, *prog, repl); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } @@ -229,16 +209,12 @@ TEST_F(StringsReplaceRegexTest, Multiline) // single-replace auto pattern = std::string("^aba$"); auto repl = cudf::string_scalar("_"); - auto results = cudf::strings::replace_re(sv, pattern, repl, std::nullopt, multiline); cudf::test::strings_column_wrapper expected_ml({"bcd\n_\nefg", "_\naba 
abab\n_", "_"}); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_ml); - auto prog = cudf::strings::regex_program::create(pattern, multiline); - results = cudf::strings::replace_re(sv, *prog, repl); + auto prog = cudf::strings::regex_program::create(pattern, multiline); + auto results = cudf::strings::replace_re(sv, *prog, repl); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_ml); - results = cudf::strings::replace_re(sv, pattern, repl); cudf::test::strings_column_wrapper expected({"bcd\naba\nefg", "aba\naba abab\naba", "_"}); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); prog = cudf::strings::regex_program::create(pattern); results = cudf::strings::replace_re(sv, *prog, repl); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); @@ -257,18 +233,14 @@ TEST_F(StringsReplaceRegexTest, Multiline) // backref-replace auto repl_template = std::string("[\\1]"); pattern = std::string("(^aba)"); - results = cudf::strings::replace_with_backrefs(sv, pattern, repl_template, multiline); cudf::test::strings_column_wrapper br_expected_ml( {"bcd\n[aba]\nefg", "[aba]\n[aba] abab\n[aba]", "[aba]"}); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, br_expected_ml); prog = cudf::strings::regex_program::create(pattern, multiline); results = cudf::strings::replace_with_backrefs(sv, *prog, repl_template); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, br_expected_ml); - results = cudf::strings::replace_with_backrefs(sv, pattern, repl_template); cudf::test::strings_column_wrapper br_expected( {"bcd\naba\nefg", "[aba]\naba abab\naba", "[aba]"}); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, br_expected); prog = cudf::strings::regex_program::create(pattern); results = cudf::strings::replace_with_backrefs(sv, *prog, repl_template); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, br_expected); @@ -298,12 +270,10 @@ TEST_F(StringsReplaceRegexTest, ReplaceBackrefsRegexTest) auto pattern = std::string("(\\w) (\\w)"); auto repl_template = std::string("\\1-\\2"); - 
auto results = cudf::strings::replace_with_backrefs(sv, pattern, repl_template); cudf::test::strings_column_wrapper expected( h_expected.begin(), h_expected.end(), cudf::test::iterators::nulls_from_nullptrs(h_expected)); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - auto prog = cudf::strings::regex_program::create(pattern); - results = cudf::strings::replace_with_backrefs(sv, *prog, repl_template); + auto prog = cudf::strings::regex_program::create(pattern); + auto results = cudf::strings::replace_with_backrefs(sv, *prog, repl_template); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } @@ -314,13 +284,11 @@ TEST_F(StringsReplaceRegexTest, ReplaceBackrefsRegexAltIndexPatternTest) auto pattern = std::string("(\\d+)-(\\d+)"); auto repl_template = std::string("${2} X ${1}0"); - auto results = cudf::strings::replace_with_backrefs(sv, pattern, repl_template); cudf::test::strings_column_wrapper expected( {"3 X 120 5 X 340 89 X 670", "99 X 00: 888 X 7770:: 0 X 56730"}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - auto prog = cudf::strings::regex_program::create(pattern); - results = cudf::strings::replace_with_backrefs(sv, *prog, repl_template); + auto prog = cudf::strings::regex_program::create(pattern); + auto results = cudf::strings::replace_with_backrefs(sv, *prog, repl_template); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } @@ -332,7 +300,6 @@ TEST_F(StringsReplaceRegexTest, ReplaceBackrefsRegexReversedTest) auto pattern = std::string("([a-z])-([a-zé])"); auto repl_template = std::string("X\\2+\\1Z"); - auto results = cudf::strings::replace_with_backrefs(sv, pattern, repl_template); cudf::test::strings_column_wrapper expected({"A543", "Z756", @@ -341,9 +308,8 @@ TEST_F(StringsReplaceRegexTest, ReplaceBackrefsRegexReversedTest) "twXt+oZhréé fouXf+rZivé", "abcXé+dZfgh", "tésXs+tZtrinXa+gZgain"}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - auto prog = cudf::strings::regex_program::create(pattern); - results = 
cudf::strings::replace_with_backrefs(sv, *prog, repl_template); + auto prog = cudf::strings::regex_program::create(pattern); + auto results = cudf::strings::replace_with_backrefs(sv, *prog, repl_template); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } @@ -356,17 +322,13 @@ TEST_F(StringsReplaceRegexTest, BackrefWithGreedyQuantifier) auto pattern = std::string("

(.*)

(.*)

"); auto repl_template = std::string("

\\1

\\2

"); - auto results = cudf::strings::replace_with_backrefs(sv, pattern, repl_template); cudf::test::strings_column_wrapper expected( {"

title

ABC

", "

1234567

XYZ

"}); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); - auto prog = cudf::strings::regex_program::create(pattern); - results = cudf::strings::replace_with_backrefs(sv, *prog, repl_template); + auto prog = cudf::strings::regex_program::create(pattern); + auto results = cudf::strings::replace_with_backrefs(sv, *prog, repl_template); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); pattern = std::string("

([a-z\\d]+)

([A-Z]+)

"); - results = cudf::strings::replace_with_backrefs(sv, pattern, repl_template); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); prog = cudf::strings::regex_program::create(pattern); results = cudf::strings::replace_with_backrefs(sv, *prog, repl_template); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); @@ -380,7 +342,6 @@ TEST_F(StringsReplaceRegexTest, ReplaceBackrefsRegexZeroIndexTest) auto pattern = std::string("(TEST)(\\d+)"); auto repl_template = std::string("${0}: ${1}, ${2}; "); - auto results = cudf::strings::replace_with_backrefs(sv, pattern, repl_template); cudf::test::strings_column_wrapper expected({ "TEST123: TEST, 123; ", @@ -389,9 +350,8 @@ TEST_F(StringsReplaceRegexTest, ReplaceBackrefsRegexZeroIndexTest) "TEST1: TEST, 1; -TEST-T", "TES3", }); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - auto prog = cudf::strings::regex_program::create(pattern); - results = cudf::strings::replace_with_backrefs(sv, *prog, repl_template); + auto prog = cudf::strings::regex_program::create(pattern); + auto results = cudf::strings::replace_with_backrefs(sv, *prog, repl_template); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } @@ -401,9 +361,12 @@ TEST_F(StringsReplaceRegexTest, ReplaceBackrefsRegexErrorTest) auto view = cudf::strings_column_view(strings); // group index(3) exceeds the group count(2) - EXPECT_THROW(cudf::strings::replace_with_backrefs(view, "(\\w).(\\w)", "\\3"), cudf::logic_error); - EXPECT_THROW(cudf::strings::replace_with_backrefs(view, "", "\\1"), cudf::logic_error); - EXPECT_THROW(cudf::strings::replace_with_backrefs(view, "(\\w)", ""), cudf::logic_error); + auto prog = cudf::strings::regex_program::create("(\\w).(\\w)"); + EXPECT_THROW(cudf::strings::replace_with_backrefs(view, *prog, "\\3"), cudf::logic_error); + prog = cudf::strings::regex_program::create(""); + EXPECT_THROW(cudf::strings::replace_with_backrefs(view, *prog, "\\1"), cudf::logic_error); + prog = 
cudf::strings::regex_program::create("(\\w)"); + EXPECT_THROW(cudf::strings::replace_with_backrefs(view, *prog, ""), cudf::logic_error); } TEST_F(StringsReplaceRegexTest, MediumReplaceRegex) @@ -412,6 +375,7 @@ TEST_F(StringsReplaceRegexTest, MediumReplaceRegex) std::string medium_regex = "hello @abc @def world The quick brown @fox jumps over the lazy @dog hello " "http://www.world.com"; + auto prog = cudf::strings::regex_program::create(medium_regex); std::vector h_strings{ "hello @abc @def world The quick brown @fox jumps over the lazy @dog hello " @@ -424,7 +388,7 @@ TEST_F(StringsReplaceRegexTest, MediumReplaceRegex) thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); auto strings_view = cudf::strings_column_view(strings); - auto results = cudf::strings::replace_re(strings_view, medium_regex); + auto results = cudf::strings::replace_re(strings_view, *prog); std::vector h_expected{ " thats all", "12345678901234567890", "abcdefghijklmnopqrstuvwxyz"}; cudf::test::strings_column_wrapper expected( @@ -440,6 +404,7 @@ TEST_F(StringsReplaceRegexTest, LargeReplaceRegex) std::string large_regex = "hello @abc @def world The (quick) brown @fox jumps over the lazy @dog hello " "http://www.world.com I'm here @home zzzz"; + auto prog = cudf::strings::regex_program::create(large_regex); std::vector h_strings{ "zzzz hello @abc @def world The quick brown @fox jumps over the lazy @dog hello " @@ -452,7 +417,7 @@ TEST_F(StringsReplaceRegexTest, LargeReplaceRegex) thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); auto strings_view = cudf::strings_column_view(strings); - auto results = cudf::strings::replace_re(strings_view, large_regex); + auto results = cudf::strings::replace_re(strings_view, *prog); std::vector h_expected{ "zzzz ", "12345678901234567890", "abcdefghijklmnopqrstuvwxyz"}; cudf::test::strings_column_wrapper expected( diff --git a/cpp/tests/strings/split_tests.cpp 
b/cpp/tests/strings/split_tests.cpp index 714c1ad416a..7cca564d112 100644 --- a/cpp/tests/strings/split_tests.cpp +++ b/cpp/tests/strings/split_tests.cpp @@ -394,27 +394,22 @@ TEST_F(StringsSplitTest, SplitRegex) { auto pattern = std::string("\\s+"); - auto result = cudf::strings::split_re(sv, pattern); cudf::test::strings_column_wrapper col0({"", "", "are", "tést", ""}, validity); cudf::test::strings_column_wrapper col1({"Héllo", "", "some", "String", ""}, {1, 0, 1, 1, 0}); cudf::test::strings_column_wrapper col2({"thesé", "", "", "", ""}, {1, 0, 1, 0, 0}); auto expected = cudf::table_view({col0, col1, col2}); - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected); - auto prog = cudf::strings::regex_program::create(pattern); - result = cudf::strings::split_re(sv, *prog); + auto prog = cudf::strings::regex_program::create(pattern); + auto result = cudf::strings::split_re(sv, *prog); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected); // rsplit == split when using default parameters - result = cudf::strings::rsplit_re(sv, pattern); - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected); result = cudf::strings::rsplit_re(sv, *prog); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected); } { auto pattern = std::string("[eé]"); - auto result = cudf::strings::split_re(sv, pattern); cudf::test::strings_column_wrapper col0({" H", "", "ar", "t", ""}, validity); cudf::test::strings_column_wrapper col1({"llo th", "", " som", "st String", ""}, @@ -422,14 +417,11 @@ TEST_F(StringsSplitTest, SplitRegex) cudf::test::strings_column_wrapper col2({"s", "", " ", "", ""}, {1, 0, 1, 0, 0}); cudf::test::strings_column_wrapper col3({"", "", "", "", ""}, {1, 0, 0, 0, 0}); auto expected = cudf::table_view({col0, col1, col2, col3}); - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected); - auto prog = cudf::strings::regex_program::create(pattern); - result = cudf::strings::split_re(sv, *prog); + auto prog = 
cudf::strings::regex_program::create(pattern); + auto result = cudf::strings::split_re(sv, *prog); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected); // rsplit == split when using default parameters - result = cudf::strings::rsplit_re(sv, pattern); - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected); result = cudf::strings::rsplit_re(sv, *prog); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected); } @@ -446,26 +438,21 @@ TEST_F(StringsSplitTest, SplitRecordRegex) using LCW = cudf::test::lists_column_wrapper; { auto pattern = std::string("\\s+"); - auto result = cudf::strings::split_record_re(sv, pattern); LCW expected( {LCW{"", "Héllo", "thesé"}, LCW{}, LCW{"are", "some", ""}, LCW{"tést", "String"}, LCW{""}}, validity); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); - auto prog = cudf::strings::regex_program::create(pattern); - result = cudf::strings::split_record_re(sv, *prog); + auto prog = cudf::strings::regex_program::create(pattern); + auto result = cudf::strings::split_record_re(sv, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); // rsplit == split when using default parameters - result = cudf::strings::rsplit_record_re(sv, pattern); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); result = cudf::strings::rsplit_record_re(sv, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); } { auto pattern = std::string("[eé]"); - auto result = cudf::strings::split_record_re(sv, pattern); LCW expected({LCW{" H", "llo th", "s", ""}, LCW{}, @@ -473,14 +460,11 @@ TEST_F(StringsSplitTest, SplitRecordRegex) LCW{"t", "st String"}, LCW{""}}, validity); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); - auto prog = cudf::strings::regex_program::create(pattern); - result = cudf::strings::split_record_re(sv, *prog); + auto prog = cudf::strings::regex_program::create(pattern); + auto result = cudf::strings::split_record_re(sv, *prog); 
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); // rsplit == split when using default parameters - result = cudf::strings::rsplit_record_re(sv, pattern); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); result = cudf::strings::rsplit_record_re(sv, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); } @@ -495,48 +479,42 @@ TEST_F(StringsSplitTest, SplitRegexWithMaxSplit) auto sv = cudf::strings_column_view(input); { auto pattern = std::string("\\s+"); - auto result = cudf::strings::split_re(sv, pattern, 1); cudf::test::strings_column_wrapper col0({"", "", "are", "tést", ""}, {1, 0, 1, 1, 1}); cudf::test::strings_column_wrapper col1({"Héllo\tthesé", "", "some ", "String", ""}, {1, 0, 1, 1, 0}); auto expected = cudf::table_view({col0, col1}); - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected); - auto prog = cudf::strings::regex_program::create(pattern); - result = cudf::strings::split_re(sv, *prog, 1); + auto prog = cudf::strings::regex_program::create(pattern); + auto result = cudf::strings::split_re(sv, *prog, 1); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected); // split everything is the same output as maxsplit==2 for the test input column here - result = cudf::strings::split_re(sv, pattern, 2); - auto expected2 = cudf::strings::split_re(sv, pattern); + result = cudf::strings::split_re(sv, *prog, 2); + auto expected2 = cudf::strings::split_re(sv, *prog); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected2->view()); result = cudf::strings::split_re(sv, *prog, 3); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected2->view()); } { auto pattern = std::string("\\s"); - auto result = cudf::strings::split_record_re(sv, pattern, 1); using LCW = cudf::test::lists_column_wrapper; LCW expected1( {LCW{"", "Héllo\tthesé"}, LCW{}, LCW{"are", "some "}, LCW{"tést", "String"}, LCW{""}}, validity); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected1); - auto prog = 
cudf::strings::regex_program::create(pattern); - result = cudf::strings::split_record_re(sv, *prog, 1); + auto prog = cudf::strings::regex_program::create(pattern); + auto result = cudf::strings::split_record_re(sv, *prog, 1); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected1); - result = cudf::strings::split_record_re(sv, pattern, 2); + result = cudf::strings::split_record_re(sv, *prog, 2); LCW expected2( {LCW{"", "Héllo", "thesé"}, LCW{}, LCW{"are", "some", " "}, LCW{"tést", "String"}, LCW{""}}, validity); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected2); - result = cudf::strings::split_record_re(sv, *prog, 2); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected2); // split everything is the same output as maxsplit==3 for the test input column here - result = cudf::strings::split_record_re(sv, pattern, 3); - auto expected0 = cudf::strings::split_record_re(sv, pattern); + result = cudf::strings::split_record_re(sv, *prog, 3); + auto expected0 = cudf::strings::split_record_re(sv, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected0->view()); result = cudf::strings::split_record_re(sv, *prog, 3); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected0->view()); @@ -549,7 +527,6 @@ TEST_F(StringsSplitTest, SplitRegexWordBoundary) auto sv = cudf::strings_column_view(input); { auto pattern = std::string("\\b"); - auto result = cudf::strings::split_re(sv, pattern); cudf::test::strings_column_wrapper col0({"", "", "-+", ""}); cudf::test::strings_column_wrapper col1({"a", "ab", "", "e"}, {1, 1, 0, 1}); @@ -557,20 +534,17 @@ TEST_F(StringsSplitTest, SplitRegexWordBoundary) cudf::test::strings_column_wrapper col3({"", "", "", "é"}, {0, 0, 0, 1}); cudf::test::strings_column_wrapper col4({"", "", "", ""}, {0, 0, 0, 1}); auto expected = cudf::table_view({col0, col1, col2, col3, col4}); - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected); - auto prog = cudf::strings::regex_program::create(pattern); - result 
= cudf::strings::split_re(sv, *prog); + auto prog = cudf::strings::regex_program::create(pattern); + auto result = cudf::strings::split_re(sv, *prog); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected); } { auto pattern = std::string("\\B"); - auto result = cudf::strings::split_record_re(sv, pattern); using LCW = cudf::test::lists_column_wrapper; LCW expected({LCW{"a"}, LCW{"a", "b"}, LCW{"", "-", "+", ""}, LCW{"e\né"}}); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); - auto prog = cudf::strings::regex_program::create(pattern); - result = cudf::strings::split_record_re(sv, *prog); + auto prog = cudf::strings::regex_program::create(pattern); + auto result = cudf::strings::split_record_re(sv, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); } } @@ -678,29 +652,23 @@ TEST_F(StringsSplitTest, RSplitRegexWithMaxSplit) auto prog = cudf::strings::regex_program::create(pattern); { - auto result = cudf::strings::rsplit_re(sv, pattern, 1); - cudf::test::strings_column_wrapper col0({" Héllo", "", "are some", "tést", ""}, validity); cudf::test::strings_column_wrapper col1({"thesé", "", "", "String", ""}, {1, 0, 1, 1, 0}); auto expected = cudf::table_view({col0, col1}); - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected); - result = cudf::strings::rsplit_re(sv, *prog, 1); + auto result = cudf::strings::rsplit_re(sv, *prog, 1); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected); } { - auto result = cudf::strings::rsplit_record_re(sv, pattern, 1); - using LCW = cudf::test::lists_column_wrapper; LCW expected( {LCW{" Héllo", "thesé"}, LCW{}, LCW{"are some", ""}, LCW{"tést", "String"}, LCW{""}}, validity); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); - result = cudf::strings::rsplit_record_re(sv, *prog, 1); + auto result = cudf::strings::rsplit_record_re(sv, *prog, 1); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); // split everything is the same output as any maxsplit > 2 for 
the test input column here - result = cudf::strings::rsplit_record_re(sv, pattern, 3); - auto expected0 = cudf::strings::rsplit_record_re(sv, pattern); + result = cudf::strings::rsplit_record_re(sv, *prog, 3); + auto expected0 = cudf::strings::rsplit_record_re(sv, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected0->view()); result = cudf::strings::rsplit_record_re(sv, *prog, 3); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected0->view()); @@ -711,16 +679,17 @@ TEST_F(StringsSplitTest, SplitZeroSizeStringsColumns) { cudf::column_view zero_size_strings_column( cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); + auto prog = cudf::strings::regex_program::create("\\s"); auto results = cudf::strings::split(zero_size_strings_column); EXPECT_TRUE(results->num_columns() == 1); EXPECT_TRUE(results->num_rows() == 0); results = cudf::strings::rsplit(zero_size_strings_column); EXPECT_TRUE(results->num_columns() == 1); EXPECT_TRUE(results->num_rows() == 0); - results = cudf::strings::split_re(zero_size_strings_column, "\\s"); + results = cudf::strings::split_re(zero_size_strings_column, *prog); EXPECT_TRUE(results->num_columns() == 1); EXPECT_TRUE(results->num_rows() == 0); - results = cudf::strings::rsplit_re(zero_size_strings_column, "\\s"); + results = cudf::strings::rsplit_re(zero_size_strings_column, *prog); EXPECT_TRUE(results->num_columns() == 1); EXPECT_TRUE(results->num_rows() == 0); @@ -728,9 +697,9 @@ TEST_F(StringsSplitTest, SplitZeroSizeStringsColumns) EXPECT_TRUE(list_result->size() == 0); list_result = cudf::strings::rsplit_record(zero_size_strings_column); EXPECT_TRUE(list_result->size() == 0); - list_result = cudf::strings::split_record_re(zero_size_strings_column, "\\s"); + list_result = cudf::strings::split_record_re(zero_size_strings_column, *prog); EXPECT_TRUE(list_result->size() == 0); - list_result = cudf::strings::rsplit_record_re(zero_size_strings_column, "\\s"); + list_result = 
cudf::strings::rsplit_record_re(zero_size_strings_column, *prog); EXPECT_TRUE(list_result->size() == 0); } @@ -738,7 +707,8 @@ TEST_F(StringsSplitTest, SplitZeroSizeStringsColumns) TEST_F(StringsSplitTest, AllNullsCase) { cudf::test::strings_column_wrapper input({"", "", ""}, {0, 0, 0}); - auto sv = cudf::strings_column_view(input); + auto sv = cudf::strings_column_view(input); + auto prog = cudf::strings::regex_program::create("-"); auto results = cudf::strings::split(sv); EXPECT_TRUE(results->num_columns() == 1); @@ -752,10 +722,10 @@ TEST_F(StringsSplitTest, AllNullsCase) results = cudf::strings::rsplit(sv, cudf::string_scalar("-")); EXPECT_TRUE(results->num_columns() == 1); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->get_column(0).view(), input); - results = cudf::strings::split_re(sv, "-"); + results = cudf::strings::split_re(sv, *prog); EXPECT_TRUE(results->num_columns() == 1); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->get_column(0).view(), input); - results = cudf::strings::rsplit_re(sv, "-"); + results = cudf::strings::rsplit_re(sv, *prog); EXPECT_TRUE(results->num_columns() == 1); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->get_column(0).view(), input); @@ -765,9 +735,9 @@ TEST_F(StringsSplitTest, AllNullsCase) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(list_result->view(), expected); list_result = cudf::strings::rsplit_record(sv); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(list_result->view(), expected); - list_result = cudf::strings::split_record_re(sv, "-"); + list_result = cudf::strings::split_record_re(sv, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(list_result->view(), expected); - list_result = cudf::strings::rsplit_record_re(sv, "-"); + list_result = cudf::strings::rsplit_record_re(sv, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(list_result->view(), expected); } @@ -945,6 +915,7 @@ TEST_F(StringsSplitTest, InvalidParameter) { cudf::test::strings_column_wrapper input({"string left intentionally blank"}); auto strings_view = cudf::strings_column_view(input); 
+ auto prog = cudf::strings::regex_program::create(""); EXPECT_THROW(cudf::strings::split(strings_view, cudf::string_scalar("", false)), cudf::logic_error); EXPECT_THROW(cudf::strings::rsplit(strings_view, cudf::string_scalar("", false)), @@ -953,10 +924,10 @@ TEST_F(StringsSplitTest, InvalidParameter) cudf::logic_error); EXPECT_THROW(cudf::strings::rsplit_record(strings_view, cudf::string_scalar("", false)), cudf::logic_error); - EXPECT_THROW(cudf::strings::split_re(strings_view, ""), cudf::logic_error); - EXPECT_THROW(cudf::strings::split_record_re(strings_view, ""), cudf::logic_error); - EXPECT_THROW(cudf::strings::rsplit_re(strings_view, ""), cudf::logic_error); - EXPECT_THROW(cudf::strings::rsplit_record_re(strings_view, ""), cudf::logic_error); + EXPECT_THROW(cudf::strings::split_re(strings_view, *prog), cudf::logic_error); + EXPECT_THROW(cudf::strings::split_record_re(strings_view, *prog), cudf::logic_error); + EXPECT_THROW(cudf::strings::rsplit_re(strings_view, *prog), cudf::logic_error); + EXPECT_THROW(cudf::strings::rsplit_record_re(strings_view, *prog), cudf::logic_error); EXPECT_THROW(cudf::strings::partition(strings_view, cudf::string_scalar("", false)), cudf::logic_error); EXPECT_THROW(cudf::strings::rpartition(strings_view, cudf::string_scalar("", false)),