From f5f63eb0fce482a9e9438446ae3053945b2a77e0 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 4 Apr 2022 11:54:13 -0400 Subject: [PATCH 1/6] Refactor cudf::strings::count_re API to use count_matches utility --- cpp/src/strings/contains.cu | 73 +++++------------------- cpp/src/strings/count_matches.cu | 24 ++++---- cpp/src/strings/count_matches.hpp | 2 + cpp/src/strings/extract/extract_all.cu | 2 +- cpp/src/strings/search/findall.cu | 3 +- cpp/src/strings/search/findall_record.cu | 2 +- cpp/src/strings/split/split_re.cu | 4 +- cpp/tests/strings/contains_tests.cpp | 16 ++---- 8 files changed, 37 insertions(+), 89 deletions(-) diff --git a/cpp/src/strings/contains.cu b/cpp/src/strings/contains.cu index 773430953c9..452c8411edd 100644 --- a/cpp/src/strings/contains.cu +++ b/cpp/src/strings/contains.cu @@ -14,6 +14,7 @@ * limitations under the License. */ +#include #include #include #include @@ -137,71 +138,25 @@ std::unique_ptr matches_re(strings_column_view const& strings, } namespace detail { -namespace { -/** - * @brief This counts the number of times the regex pattern matches in each string. - */ -template -struct count_fn { - reprog_device prog; - column_device_view const d_strings; - - __device__ int32_t operator()(unsigned int idx) - { - if (d_strings.is_null(idx)) return 0; - auto const d_str = d_strings.element(idx); - auto const nchars = d_str.length(); - int32_t find_count = 0; - int32_t begin = 0; - while (begin < nchars) { - auto end = static_cast(nchars); - if (prog.find(idx, d_str, begin, end) <= 0) break; - ++find_count; - begin = end > begin ? end : begin + 1; - } - return find_count; - } -}; - -struct count_dispatch_fn { - reprog_device d_prog; - template - std::unique_ptr operator()(strings_column_view const& input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - { - auto results = make_numeric_column(data_type{type_id::INT32}, - input.size(), - cudf::detail::copy_bitmask(input.parent(), stream, mr), - input.null_count(), - stream, - mr); - - auto const d_strings = column_device_view::create(input.parent(), stream); - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(input.size()), - results->mutable_view().data(), - count_fn{d_prog, *d_strings}); - return results; - } -}; - -} // namespace - -std::unique_ptr count_re( - strings_column_view const& input, - std::string const& pattern, - regex_flags const flags, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr count_re(strings_column_view const& input, + std::string const& pattern, + regex_flags const flags, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // compile regex into device object auto d_prog = reprog_device::create(pattern, flags, get_character_flags_table(), input.size(), stream); - return regex_dispatcher(*d_prog, count_dispatch_fn{*d_prog}, input, stream, mr); + auto const d_strings = column_device_view::create(input.parent(), stream); + + auto result = count_matches(*d_strings, *d_prog, input.size(), stream, mr); + if (input.has_nulls()) { + result->set_null_mask(cudf::detail::copy_bitmask(input.parent(), stream, mr), + input.null_count()); + } + return result; } } // namespace detail diff --git a/cpp/src/strings/count_matches.cu b/cpp/src/strings/count_matches.cu index 5057df7f92b..1beef64d324 100644 --- a/cpp/src/strings/count_matches.cu +++ b/cpp/src/strings/count_matches.cu @@ -43,15 +43,16 @@ struct count_matches_fn { __device__ size_type operator()(size_type idx) { if (d_strings.is_null(idx)) { return 0; } - size_type count = 0; - auto const d_str = d_strings.element(idx); + size_type count = 0; + auto const d_str = d_strings.element(idx); + auto const nchars = d_str.length(); int32_t begin = 0; - int32_t end = d_str.length(); + int32_t end = nchars; while ((begin < end) && (prog.find(idx, d_str, begin, end) > 0)) { ++count; begin = end + (begin == end); - end = d_str.length(); + end = nchars; } return count; } @@ -62,11 +63,12 @@ struct count_dispatch_fn { template std::unique_ptr operator()(column_device_view const& d_strings, + size_type output_size, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { auto results = make_numeric_column( - data_type{type_id::INT32}, d_strings.size() + 1, mask_state::UNALLOCATED, stream, mr); + data_type{type_id::INT32}, output_size, mask_state::UNALLOCATED, stream, mr); thrust::transform(rmm::exec_policy(stream), thrust::make_counting_iterator(0), @@ -80,21 +82,15 @@ struct count_dispatch_fn { } // namespace /** - * @brief Returns a column of regex match counts for each string in the given column. - * - * A null entry will result in a zero count for that output row. - * - * @param d_strings Device view of the input strings column. - * @param d_prog Regex instance to evaluate on each string. - * @param stream CUDA stream used for device memory operations and kernel launches. - * @param mr Device memory resource used to allocate the returned column's device memory. + * @copydoc cudf::strings::detail::count_matches */ std::unique_ptr count_matches(column_device_view const& d_strings, reprog_device const& d_prog, + size_type output_size, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - return regex_dispatcher(d_prog, count_dispatch_fn{d_prog}, d_strings, stream, mr); + return regex_dispatcher(d_prog, count_dispatch_fn{d_prog}, d_strings, output_size, stream, mr); } } // namespace detail diff --git a/cpp/src/strings/count_matches.hpp b/cpp/src/strings/count_matches.hpp index 1339f2b1ebd..efff3958c65 100644 --- a/cpp/src/strings/count_matches.hpp +++ b/cpp/src/strings/count_matches.hpp @@ -36,12 +36,14 @@ class reprog_device; * * @param d_strings Device view of the input strings column. * @param d_prog Regex instance to evaluate on each string. + * @param output_size Number of rows for the output column. * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. */ std::unique_ptr count_matches( column_device_view const& d_strings, reprog_device const& d_prog, + size_type output_size, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); diff --git a/cpp/src/strings/extract/extract_all.cu b/cpp/src/strings/extract/extract_all.cu index fd2d280c5bc..e679b036612 100644 --- a/cpp/src/strings/extract/extract_all.cu +++ b/cpp/src/strings/extract/extract_all.cu @@ -137,7 +137,7 @@ std::unique_ptr extract_all_record( // Get the match counts for each string. // This column will become the output lists child offsets column. - auto offsets = count_matches(*d_strings, *d_prog, stream, mr); + auto offsets = count_matches(*d_strings, *d_prog, input.size() + 1, stream, mr); auto d_offsets = offsets->mutable_view().data(); // Compute null output rows diff --git a/cpp/src/strings/search/findall.cu b/cpp/src/strings/search/findall.cu index e874d1db192..8bbe376c137 100644 --- a/cpp/src/strings/search/findall.cu +++ b/cpp/src/strings/search/findall.cu @@ -126,8 +126,7 @@ std::unique_ptr findall( reprog_device::create(pattern, flags, get_character_flags_table(), strings_count, stream); auto const regex_insts = d_prog->insts_counts(); - auto find_counts = - count_matches(*d_strings, *d_prog, stream, rmm::mr::get_current_device_resource()); + auto find_counts = count_matches(*d_strings, *d_prog, input.size() + 1, stream); auto d_find_counts = find_counts->mutable_view().data(); std::vector> results; diff --git a/cpp/src/strings/search/findall_record.cu b/cpp/src/strings/search/findall_record.cu index 7fb5982b307..dff6d1fc422 100644 --- a/cpp/src/strings/search/findall_record.cu +++ b/cpp/src/strings/search/findall_record.cu @@ -117,7 +117,7 @@ std::unique_ptr findall_record( reprog_device::create(pattern, flags, get_character_flags_table(), strings_count, stream); // Create lists offsets column - auto offsets = count_matches(*d_strings, *d_prog, stream, mr); + auto offsets = count_matches(*d_strings, *d_prog, input.size() + 1, stream, mr); auto d_offsets = offsets->mutable_view().data(); // Convert counts into offsets diff --git a/cpp/src/strings/split/split_re.cu b/cpp/src/strings/split/split_re.cu index 286492e53c5..0d50b35b3f6 100644 --- a/cpp/src/strings/split/split_re.cu +++ b/cpp/src/strings/split/split_re.cu @@ -225,7 +225,7 @@ std::unique_ptr
split_re(strings_column_view const& input, auto d_strings = column_device_view::create(input.parent(), stream); // count the number of delimiters matched in each string - auto offsets = count_matches(*d_strings, *d_prog, stream, rmm::mr::get_current_device_resource()); + auto offsets = count_matches(*d_strings, *d_prog, input.size() + 1, stream); auto offsets_view = offsets->mutable_view(); auto d_offsets = offsets_view.data(); @@ -287,7 +287,7 @@ std::unique_ptr split_record_re(strings_column_view const& input, auto d_strings = column_device_view::create(input.parent(), stream); // count the number of delimiters matched in each string - auto offsets = count_matches(*d_strings, *d_prog, stream, mr); + auto offsets = count_matches(*d_strings, *d_prog, input.size() + 1, stream, mr); auto offsets_view = offsets->mutable_view(); // get the split tokens from the input column; this also converts the counts into offsets diff --git a/cpp/tests/strings/contains_tests.cpp b/cpp/tests/strings/contains_tests.cpp index 4015c36b283..cb95643d38a 100644 --- a/cpp/tests/strings/contains_tests.cpp +++ b/cpp/tests/strings/contains_tests.cpp @@ -455,11 +455,9 @@ TEST_F(StringsContainsTests, MediumRegex) { auto results = cudf::strings::count_re(strings_view, medium_regex); int32_t h_expected[] = {1, 0, 0}; - cudf::test::fixed_width_column_wrapper expected( - h_expected, - h_expected + h_strings.size(), - thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + cudf::test::fixed_width_column_wrapper expected(h_expected, + h_expected + h_strings.size()); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } } @@ -504,11 +502,9 @@ TEST_F(StringsContainsTests, LargeRegex) { auto results = cudf::strings::count_re(strings_view, large_regex); int32_t h_expected[] = {1, 0, 0}; - cudf::test::fixed_width_column_wrapper expected( - h_expected, - h_expected + h_strings.size(), - thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + cudf::test::fixed_width_column_wrapper expected(h_expected, + h_expected + h_strings.size()); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } } From cd06bbc9acddc77de0f4159d668ee755652320dd Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 4 Apr 2022 18:10:35 -0400 Subject: [PATCH 2/6] add check for output-size at least the size of the input column --- cpp/src/strings/count_matches.cu | 2 ++ cpp/src/strings/count_matches.hpp | 2 ++ cpp/src/strings/extract/extract_all.cu | 2 +- cpp/src/strings/search/findall.cu | 2 +- cpp/src/strings/search/findall_record.cu | 2 +- cpp/src/strings/split/split_re.cu | 2 +- 6 files changed, 8 insertions(+), 4 deletions(-) diff --git a/cpp/src/strings/count_matches.cu b/cpp/src/strings/count_matches.cu index 1beef64d324..b8a3d427337 100644 --- a/cpp/src/strings/count_matches.cu +++ b/cpp/src/strings/count_matches.cu @@ -67,6 +67,8 @@ struct count_dispatch_fn { rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { + CUDF_EXPECTS(output_size >= d_strings.size(), "Unexpected output size"); + auto results = make_numeric_column( data_type{type_id::INT32}, output_size, mask_state::UNALLOCATED, stream, mr); diff --git a/cpp/src/strings/count_matches.hpp b/cpp/src/strings/count_matches.hpp index efff3958c65..b74e301f351 100644 --- a/cpp/src/strings/count_matches.hpp +++ b/cpp/src/strings/count_matches.hpp @@ -34,6 +34,8 @@ class reprog_device; * * A null entry will result in a zero count for that output row. * + * @throw cudf::logic_error if `output_size < d_strings.size()` + * * @param d_strings Device view of the input strings column. * @param d_prog Regex instance to evaluate on each string. * @param output_size Number of rows for the output column. diff --git a/cpp/src/strings/extract/extract_all.cu b/cpp/src/strings/extract/extract_all.cu index e679b036612..7dce369a24f 100644 --- a/cpp/src/strings/extract/extract_all.cu +++ b/cpp/src/strings/extract/extract_all.cu @@ -137,7 +137,7 @@ std::unique_ptr extract_all_record( // Get the match counts for each string. // This column will become the output lists child offsets column. - auto offsets = count_matches(*d_strings, *d_prog, input.size() + 1, stream, mr); + auto offsets = count_matches(*d_strings, *d_prog, strings_count + 1, stream, mr); auto d_offsets = offsets->mutable_view().data(); // Compute null output rows diff --git a/cpp/src/strings/search/findall.cu b/cpp/src/strings/search/findall.cu index 8bbe376c137..af06cf51219 100644 --- a/cpp/src/strings/search/findall.cu +++ b/cpp/src/strings/search/findall.cu @@ -126,7 +126,7 @@ std::unique_ptr
findall( reprog_device::create(pattern, flags, get_character_flags_table(), strings_count, stream); auto const regex_insts = d_prog->insts_counts(); - auto find_counts = count_matches(*d_strings, *d_prog, input.size() + 1, stream); + auto find_counts = count_matches(*d_strings, *d_prog, strings_count + 1, stream); auto d_find_counts = find_counts->mutable_view().data(); std::vector> results; diff --git a/cpp/src/strings/search/findall_record.cu b/cpp/src/strings/search/findall_record.cu index dff6d1fc422..46155bd7cf5 100644 --- a/cpp/src/strings/search/findall_record.cu +++ b/cpp/src/strings/search/findall_record.cu @@ -117,7 +117,7 @@ std::unique_ptr findall_record( reprog_device::create(pattern, flags, get_character_flags_table(), strings_count, stream); // Create lists offsets column - auto offsets = count_matches(*d_strings, *d_prog, input.size() + 1, stream, mr); + auto offsets = count_matches(*d_strings, *d_prog, strings_count + 1, stream, mr); auto d_offsets = offsets->mutable_view().data(); // Convert counts into offsets diff --git a/cpp/src/strings/split/split_re.cu b/cpp/src/strings/split/split_re.cu index 0d50b35b3f6..2823ddb6bbf 100644 --- a/cpp/src/strings/split/split_re.cu +++ b/cpp/src/strings/split/split_re.cu @@ -287,7 +287,7 @@ std::unique_ptr split_record_re(strings_column_view const& input, auto d_strings = column_device_view::create(input.parent(), stream); // count the number of delimiters matched in each string - auto offsets = count_matches(*d_strings, *d_prog, input.size() + 1, stream, mr); + auto offsets = count_matches(*d_strings, *d_prog, strings_count + 1, stream, mr); auto offsets_view = offsets->mutable_view(); // get the split tokens from the input column; this also converts the counts into offsets From aa1be9058cb4a96fc837b1d6e1834643c85f0a83 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 5 Apr 2022 12:10:25 -0400 Subject: [PATCH 3/6] use count variable already available --- cpp/src/strings/split/split_re.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/strings/split/split_re.cu b/cpp/src/strings/split/split_re.cu index 2823ddb6bbf..3ec6df058c6 100644 --- a/cpp/src/strings/split/split_re.cu +++ b/cpp/src/strings/split/split_re.cu @@ -225,7 +225,7 @@ std::unique_ptr
split_re(strings_column_view const& input, auto d_strings = column_device_view::create(input.parent(), stream); // count the number of delimiters matched in each string - auto offsets = count_matches(*d_strings, *d_prog, input.size() + 1, stream); + auto offsets = count_matches(*d_strings, *d_prog, strings_count + 1, stream); auto offsets_view = offsets->mutable_view(); auto d_offsets = offsets_view.data(); From 99987c5f541a4a871d41b9f3a3d44ed9bbf9fbaa Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 5 Apr 2022 17:48:58 -0400 Subject: [PATCH 4/6] moved lonely detail fn to existing detail ns --- cpp/src/strings/contains.cu | 44 ++++++++++++++++--------------------- 1 file changed, 19 insertions(+), 25 deletions(-) diff --git a/cpp/src/strings/contains.cu b/cpp/src/strings/contains.cu index 452c8411edd..c4ffa7f0fb1 100644 --- a/cpp/src/strings/contains.cu +++ b/cpp/src/strings/contains.cu @@ -115,30 +115,6 @@ std::unique_ptr matches_re( return regex_dispatcher(*d_prog, contains_dispatch_fn{*d_prog, true}, input, stream, mr); } -} // namespace detail - -// external APIs - -std::unique_ptr contains_re(strings_column_view const& strings, - std::string const& pattern, - regex_flags const flags, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::contains_re(strings, pattern, flags, rmm::cuda_stream_default, mr); -} - -std::unique_ptr matches_re(strings_column_view const& strings, - std::string const& pattern, - regex_flags const flags, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::matches_re(strings, pattern, flags, rmm::cuda_stream_default, mr); -} - -namespace detail { - std::unique_ptr count_re(strings_column_view const& input, std::string const& pattern, regex_flags const flags, @@ -161,7 +137,25 @@ std::unique_ptr count_re(strings_column_view const& input, } // namespace detail -// external API +// external APIs + +std::unique_ptr contains_re(strings_column_view const& strings, + std::string const& pattern, + regex_flags const flags, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::contains_re(strings, pattern, flags, rmm::cuda_stream_default, mr); +} + +std::unique_ptr matches_re(strings_column_view const& strings, + std::string const& pattern, + regex_flags const flags, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::matches_re(strings, pattern, flags, rmm::cuda_stream_default, mr); +} std::unique_ptr count_re(strings_column_view const& strings, std::string const& pattern, From 6b3d1525b57749a8e3c271708a28d48449d202f7 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Fri, 8 Apr 2022 14:44:47 -0400 Subject: [PATCH 5/6] change CUDF_FAIL to cudf_assert for count_matches detail function --- cpp/src/strings/count_matches.cu | 2 +- cpp/src/strings/count_matches.hpp | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/cpp/src/strings/count_matches.cu b/cpp/src/strings/count_matches.cu index b8a3d427337..df885a21e2c 100644 --- a/cpp/src/strings/count_matches.cu +++ b/cpp/src/strings/count_matches.cu @@ -67,7 +67,7 @@ struct count_dispatch_fn { rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - CUDF_EXPECTS(output_size >= d_strings.size(), "Unexpected output size"); + cudf_assert(output_size >= d_strings.size() and "Unexpected output size"); auto results = make_numeric_column( data_type{type_id::INT32}, output_size, mask_state::UNALLOCATED, stream, mr); diff --git a/cpp/src/strings/count_matches.hpp b/cpp/src/strings/count_matches.hpp index b74e301f351..efff3958c65 100644 --- a/cpp/src/strings/count_matches.hpp +++ b/cpp/src/strings/count_matches.hpp @@ -34,8 +34,6 @@ class reprog_device; * * A null entry will result in a zero count for that output row. * - * @throw cudf::logic_error if `output_size < d_strings.size()` - * * @param d_strings Device view of the input strings column. * @param d_prog Regex instance to evaluate on each string. * @param output_size Number of rows for the output column. From 01aa70f2b42998d813ecda48285132842e5ccb92 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Fri, 8 Apr 2022 17:29:19 -0400 Subject: [PATCH 6/6] changed cudf_assert to assert --- cpp/src/strings/count_matches.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/strings/count_matches.cu b/cpp/src/strings/count_matches.cu index df885a21e2c..a850315dfec 100644 --- a/cpp/src/strings/count_matches.cu +++ b/cpp/src/strings/count_matches.cu @@ -67,7 +67,7 @@ struct count_dispatch_fn { rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - cudf_assert(output_size >= d_strings.size() and "Unexpected output size"); + assert(output_size >= d_strings.size() and "Unexpected output size"); auto results = make_numeric_column( data_type{type_id::INT32}, output_size, mask_state::UNALLOCATED, stream, mr);