rapidsai · davidwendt · May 6, 2022 · May 6, 2022 · May 6, 2022 · May 9, 2022
@@ -232,6 +232,7 @@ outputs:
         - test -f $PREFIX/include/cudf/strings/json.hpp
         - test -f $PREFIX/include/cudf/strings/padding.hpp
         - test -f $PREFIX/include/cudf/strings/regex/flags.hpp
+        - test -f $PREFIX/include/cudf/strings/regex/config.hpp
         - test -f $PREFIX/include/cudf/strings/repeat_strings.hpp
         - test -f $PREFIX/include/cudf/strings/replace.hpp
         - test -f $PREFIX/include/cudf/strings/replace_re.hpp

@@ -492,6 +492,7 @@ add_library(
   src/strings/filter_chars.cu
   src/strings/padding.cu
   src/strings/json/json_path.cu
+  src/strings/regex/config.cpp
   src/strings/regex/regcomp.cpp
   src/strings/regex/regexec.cu
   src/strings/repeat_strings.cu

@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cudf/strings/regex/flags.hpp>
+#include <cudf/strings/strings_column_view.hpp>
+
+namespace cudf::strings {
+
+/**
+ * @addtogroup strings_regex
+ * @{
+ */
+
+/**
+ * @brief Compute the working memory size for evaluating a regex pattern
+ * on a given strings column.
+ *
+ * This function returns the size in bytes of the memory needed to evaluate
+ * the given regex pattern in parallel over the returned number of rows.
+ * The returned row count will be less than or equal to the size of the
+ * input column.
+ *
+ * This function computes only the state data memory size required to process
+ * a regex pattern over the returned row count.
+ * Specific functions that use regex may require additional working memory
+ * unrelated to the regex processing.
+ *
+ * @param input Strings instance
+ * @param pattern Regex pattern to be used
+ * @param flags Regex flags for interpreting special characters in the pattern
+ * @return Size of the state memory in bytes required for processing `pattern` on `input`
+ *         and the number of concurrent rows this memory will support
+ */
+std::pair<std::size_t, size_type> compute_regex_state_memory(
+  strings_column_view const& input,
+  std::string_view pattern,
+  regex_flags const flags = regex_flags::DEFAULT);
+
+/** @} */  // end of doxygen group
+
+}  // namespace cudf::strings
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,7 +21,7 @@ namespace cudf {
 namespace strings {
 
 /**
- * @addtogroup strings_contains
+ * @addtogroup strings_regex
  * @{
  */
 

@@ -129,6 +129,7 @@
  *   @defgroup strings_replace Replacing
  *   @defgroup strings_split Splitting
  *   @defgroup strings_json JSON
+ *   @defgroup strings_regex Regex Config
  * @}
  * @defgroup dictionary_apis Dictionary
  * @{

@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "regex.cuh"
+
+#include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/strings/regex/config.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+
+namespace cudf::strings {
+namespace detail {
+
+/**
+ * @brief Stream-taking implementation behind cudf::strings::compute_regex_state_memory.
+ *
+ * Creates a `reprog_device` from `pattern` and `flags` on `stream` and forwards the
+ * row count of `input` to its `compute_strided_working_memory`.
+ *
+ * @param input Strings column whose size() is used as the row count
+ * @param pattern Regex pattern handed to `reprog_device::create`
+ * @param flags Regex flags handed to `reprog_device::create`
+ * @param stream CUDA stream handed to `reprog_device::create`
+ * @return The pair returned by `compute_strided_working_memory`
+ */
+std::pair<std::size_t, size_type> compute_regex_state_memory(strings_column_view const& input,
+                                                             std::string_view pattern,
+                                                             regex_flags const flags,
+                                                             rmm::cuda_stream_view stream)
+{
+  auto const row_count = input.size();
+  auto const prog      = reprog_device::create(pattern, flags, stream);
+  return prog->compute_strided_working_memory(row_count);
+}
+
+}  // namespace detail
+
+// Public entry point: opens a CUDF_FUNC_RANGE and runs the detail implementation
+// on rmm::cuda_stream_default.
+std::pair<std::size_t, size_type> compute_regex_state_memory(strings_column_view const& input,
+                                                             std::string_view pattern,
+                                                             regex_flags const flags)
+{
+  CUDF_FUNC_RANGE();
+  auto const stream = rmm::cuda_stream_default;
+  return detail::compute_regex_state_memory(input, pattern, flags, stream);
+}
+
+}  // namespace cudf::strings
@@ -152,7 +152,7 @@ std::pair<std::size_t, int32_t> reprog_device::compute_strided_working_memory(
     thread_count = min_rows;
     buffer_size  = working_memory_size(thread_count);
   }
-  return std::make_pair(buffer_size, thread_count);
+  return std::pair(buffer_size, thread_count);
 }
 
 void reprog_device::set_working_memory(void* buffer, int32_t thread_count, int32_t max_insts)

@@ -140,7 +140,7 @@ auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn,
       size_and_exec_fn, d_prog, strings_count);
   }
 
-  return std::make_pair(std::move(offsets), std::move(chars));
+  return std::pair(std::move(offsets), std::move(chars));
 }
 
 }  // namespace detail

@@ -406,6 +406,7 @@ ConfigureTest(
   strings/ipv4_tests.cpp
   strings/json_tests.cpp
   strings/pad_tests.cpp
+  strings/regex_config_tests.cpp
   strings/repeat_strings_tests.cpp
   strings/replace_regex_tests.cpp
   strings/replace_tests.cpp

@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf/column/column_factories.hpp>
+#include <cudf/detail/utilities/vector_factories.hpp>
+#include <cudf/strings/regex/config.hpp>
+#include <cudf/strings/strings_column_view.hpp>
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_wrapper.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_uvector.hpp>
+
+#include <string>
+
+struct StringsRegexConfigTest : public cudf::test::BaseFixture {
+};  // fixture for the compute_regex_state_memory tests; adds nothing to BaseFixture
+
+TEST_F(StringsRegexConfigTest, Basic)
+{
+  // Six rows; the validity list marks the fifth row (the second "") as null.
+  cudf::test::strings_column_wrapper input({"abc", "", "defghijk", "lmnop", "", "qrstuvwxyz"},
+                                           {1, 1, 1, 1, 0, 1});
+  auto const view = cudf::strings_column_view(input);
+
+  {
+    // Non-empty pattern.
+    auto const [bytes, rows] = cudf::strings::compute_regex_state_memory(view, "hello");
+    EXPECT_EQ(bytes, 736);
+    EXPECT_EQ(rows, view.size());
+  }
+
+  {
+    // Empty pattern.
+    auto const [bytes, rows] = cudf::strings::compute_regex_state_memory(view, "");
+    EXPECT_EQ(bytes, 160);
+    EXPECT_EQ(rows, view.size());
+  }
+}
+
+TEST_F(StringsRegexConfigTest, Large)
+{
+  // Column built from 16000001 zeroed offsets, no chars and no null mask.
+  auto const chars   = rmm::device_uvector<char>{0, rmm::cuda_stream_default};
+  auto const offsets = cudf::detail::make_zeroed_device_uvector_sync<cudf::size_type>(16000001);
+  auto const bitmask = rmm::device_uvector<cudf::bitmask_type>{0, rmm::cuda_stream_default};
+  auto const column  = cudf::make_strings_column(chars, offsets, bitmask, 0);
+  auto const view    = cudf::strings_column_view(column->view());
+
+  std::string const pattern{
+    "a very large regular expression pattern whose contents do not really matter as much as the "
+    "length does"};
+
+  // The expected row count is a quarter of the column size.
+  auto const [bytes, rows] = cudf::strings::compute_regex_state_memory(view, pattern);
+  EXPECT_EQ(bytes, 8344000000);
+  EXPECT_EQ(rows, view.size() / 4);
+}
+
+TEST_F(StringsRegexConfigTest, Empty)
+{
+  // An empty STRING column is expected to need no state memory.
+  auto const column = cudf::make_empty_column(cudf::type_id::STRING);
+  auto const view   = cudf::strings_column_view(column->view());
+
+  auto const [bytes, rows] = cudf::strings::compute_regex_state_memory(view, "a");
+  EXPECT_EQ(bytes, 0);
+  EXPECT_EQ(rows, view.size());
+}