diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index d4f0d110e3c..5d2d2c96eb5 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -232,6 +232,7 @@ outputs: - test -f $PREFIX/include/cudf/strings/json.hpp - test -f $PREFIX/include/cudf/strings/padding.hpp - test -f $PREFIX/include/cudf/strings/regex/flags.hpp + - test -f $PREFIX/include/cudf/strings/regex/config.hpp - test -f $PREFIX/include/cudf/strings/repeat_strings.hpp - test -f $PREFIX/include/cudf/strings/replace.hpp - test -f $PREFIX/include/cudf/strings/replace_re.hpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 7a033509bbe..f84fab40ac6 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -492,6 +492,7 @@ add_library( src/strings/filter_chars.cu src/strings/padding.cu src/strings/json/json_path.cu + src/strings/regex/config.cpp src/strings/regex/regcomp.cpp src/strings/regex/regexec.cu src/strings/repeat_strings.cu diff --git a/cpp/include/cudf/strings/regex/config.hpp b/cpp/include/cudf/strings/regex/config.hpp new file mode 100644 index 00000000000..505b1cf2dc6 --- /dev/null +++ b/cpp/include/cudf/strings/regex/config.hpp @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include +#include + +namespace cudf::strings { + +/** + * @addtogroup strings_regex + * @{ + */ + +/** + * @brief Compute the working memory size for evaluating a regex pattern + * on a given strings column. + * + * This function returns the size in bytes of the memory needed to evaluate + * the given regex pattern in parallel over the returned output rows. + * The number of output rows will be less than or equal to the size of the + * input column. + * + * This function computes only the state data memory size required to process + * a regex pattern over the output row count. + * Specific functions that use regex may require additional working memory + * unrelated to the regex processing. + * + * @param input Strings instance + * @param pattern Regex pattern to be used + * @param flags Regex flags for interpreting special characters in the pattern + * @return Size of the state memory in bytes required for processing `pattern` on `input` + * and the number of concurrent rows this memory will support, as a `{bytes, rows}` pair + */ +std::pair compute_regex_state_memory( + strings_column_view const& input, + std::string_view pattern, + regex_flags const flags = regex_flags::DEFAULT); + +/** @} */ // end of doxygen group + +} // namespace cudf::strings diff --git a/cpp/include/cudf/strings/regex/flags.hpp b/cpp/include/cudf/strings/regex/flags.hpp index 637b3b0851b..cc9003de82d 100644 --- a/cpp/include/cudf/strings/regex/flags.hpp +++ b/cpp/include/cudf/strings/regex/flags.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -21,7 +21,7 @@ namespace cudf { namespace strings { /** - * @addtogroup strings_contains + * @addtogroup strings_regex * @{ */ diff --git a/cpp/include/doxygen_groups.h b/cpp/include/doxygen_groups.h index 0abaebc3b0c..eddf672b123 100644 --- a/cpp/include/doxygen_groups.h +++ b/cpp/include/doxygen_groups.h @@ -129,6 +129,7 @@ * @defgroup strings_replace Replacing * @defgroup strings_split Splitting * @defgroup strings_json JSON + * @defgroup strings_regex Regex Config * @} * @defgroup dictionary_apis Dictionary * @{ diff --git a/cpp/src/strings/regex/config.cpp b/cpp/src/strings/regex/config.cpp new file mode 100644 index 00000000000..ed732bb21bd --- /dev/null +++ b/cpp/src/strings/regex/config.cpp @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "regex.cuh" + +#include +#include + +#include + +namespace cudf::strings { +namespace detail { + +std::pair compute_regex_state_memory(strings_column_view const& input, + std::string_view pattern, + regex_flags const flags, + rmm::cuda_stream_view stream) +{ + auto const d_prog = reprog_device::create(pattern, flags, stream); /* reprog_device created from pattern/flags on the caller's stream */ + return d_prog->compute_strided_working_memory(input.size()); /* only input.size() is passed on, not the column contents */ +} + +} // namespace detail + +std::pair compute_regex_state_memory(strings_column_view const& input, + std::string_view pattern, + regex_flags const flags) +{ + CUDF_FUNC_RANGE(); + return detail::compute_regex_state_memory(input, pattern, flags, rmm::cuda_stream_default); /* public overload forwards with the default stream */ +} + +} // namespace cudf::strings diff --git a/cpp/src/strings/regex/regexec.cu b/cpp/src/strings/regex/regexec.cu index 16f5b6fa03d..f4ab28a9232 100644 --- a/cpp/src/strings/regex/regexec.cu +++ b/cpp/src/strings/regex/regexec.cu @@ -152,7 +152,7 @@ std::pair reprog_device::compute_strided_working_memory( thread_count = min_rows; buffer_size = working_memory_size(thread_count); } - return std::make_pair(buffer_size, thread_count); + return std::pair(buffer_size, thread_count); } void reprog_device::set_working_memory(void* buffer, int32_t thread_count, int32_t max_insts) diff --git a/cpp/src/strings/regex/utilities.cuh b/cpp/src/strings/regex/utilities.cuh index 9a80be25b3b..910907ee4d4 100644 --- a/cpp/src/strings/regex/utilities.cuh +++ b/cpp/src/strings/regex/utilities.cuh @@ -140,7 +140,7 @@ auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn, size_and_exec_fn, d_prog, strings_count); } - return std::make_pair(std::move(offsets), std::move(chars)); + return std::pair(std::move(offsets), std::move(chars)); } } // namespace detail diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 
816c5a1c59c..d122d3e19f3 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -406,6 +406,7 @@ ConfigureTest( strings/ipv4_tests.cpp strings/json_tests.cpp 
strings/pad_tests.cpp + strings/regex_config_tests.cpp strings/repeat_strings_tests.cpp strings/replace_regex_tests.cpp strings/replace_tests.cpp diff --git a/cpp/tests/strings/regex_config_tests.cpp b/cpp/tests/strings/regex_config_tests.cpp new file mode 100644 index 00000000000..dace584bad9 --- /dev/null +++ b/cpp/tests/strings/regex_config_tests.cpp @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include + +#include + +#include + +struct StringsRegexConfigTest : public cudf::test::BaseFixture { +}; + +TEST_F(StringsRegexConfigTest, Basic) +{ + cudf::test::strings_column_wrapper input({"abc", "", "defghijk", "lmnop", "", "qrstuvwxyz"}, + {1, 1, 1, 1, 0, 1}); + auto sv = cudf::strings_column_view(input); + + auto results = cudf::strings::compute_regex_state_memory(sv, "hello"); + EXPECT_EQ(results.first, 736); + EXPECT_EQ(results.second, sv.size()); /* small input: every row is expected to be covered */ + + results = cudf::strings::compute_regex_state_memory(sv, ""); /* empty pattern */ + EXPECT_EQ(results.first, 160); + EXPECT_EQ(results.second, sv.size()); +} + +TEST_F(StringsRegexConfigTest, Large) +{ + auto const d_chars = rmm::device_uvector{0, rmm::cuda_stream_default}; + auto const d_offsets = cudf::detail::make_zeroed_device_uvector_sync(16000001); /* presumably 16,000,000 empty rows plus one trailing offset -- TODO confirm */ + auto const 
d_nulls = rmm::device_uvector{0, rmm::cuda_stream_default}; + auto const input = cudf::make_strings_column(d_chars, d_offsets, d_nulls, 0); + auto const sv = 
cudf::strings_column_view(input->view()); + + std::string pattern = + "a very large regular expression pattern whose contents do not really matter as much as the " + "length does"; + + auto results = cudf::strings::compute_regex_state_memory(sv, pattern); + EXPECT_EQ(results.first, 8344000000); + EXPECT_EQ(results.second, sv.size() / 4); /* expects only a quarter of the rows to be covered at once */ +} + +TEST_F(StringsRegexConfigTest, Empty) +{ + auto empty_col = cudf::make_empty_column(cudf::type_id::STRING); + auto sv = cudf::strings_column_view(empty_col->view()); + + auto results = cudf::strings::compute_regex_state_memory(sv, "a"); + EXPECT_EQ(results.first, 0); /* an empty column is expected to need no state memory */ + EXPECT_EQ(results.second, sv.size()); +}