rapidsai · rapids-bot · Feb 11, 2022 · Jan 26, 2022 · Jan 26, 2022 · Jan 27, 2022
@@ -216,6 +216,7 @@ test:
     - test -f $PREFIX/include/cudf/strings/replace_re.hpp
     - test -f $PREFIX/include/cudf/strings/split/partition.hpp
     - test -f $PREFIX/include/cudf/strings/split/split.hpp
+    - test -f $PREFIX/include/cudf/strings/split/split_re.hpp
     - test -f $PREFIX/include/cudf/strings/string_view.hpp
     - test -f $PREFIX/include/cudf/strings/strings_column_view.hpp
     - test -f $PREFIX/include/cudf/strings/strip.hpp

@@ -445,6 +445,7 @@ add_library(
   src/strings/search/find_multiple.cu
   src/strings/split/partition.cu
   src/strings/split/split.cu
+  src/strings/split/split_re.cu
   src/strings/split/split_record.cu
   src/strings/strings_column_factories.cu
   src/strings/strings_column_view.cpp

@@ -0,0 +1,232 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cudf/column/column.hpp>
+#include <cudf/strings/strings_column_view.hpp>
+#include <cudf/table/table.hpp>
+
+namespace cudf {
+namespace strings {
+/**
+ * @addtogroup strings_split
+ * @{
+ * @file
+ */
+
+/**
+ * @brief Splits strings elements into a table of strings columns
+ * using a regex pattern to delimit each string.
+ *
+ * Each element generates a vector of strings that are stored in corresponding
+ * rows in the output table -- `table[col,row] = token[col] of strings[row]`
+ * where `token` is a substring between delimiters.
+ *
+ * The number of rows in the output table will be the same as the number of
+ * elements in the input column. The resulting number of columns will be the
+ * maximum number of tokens found in any input row.
+ *
+ * The `pattern` is used to identify the delimiters within a string
+ * and splitting stops when either `maxsplit` or the end of the string is reached.
+ *
+ * An empty input string will produce a corresponding empty string in the
+ * corresponding row of the first column.
+ * A null row will produce corresponding null rows in the output table.
+ *
+ * @code{.pseudo}
+ * s = ["a_bc def_g", "a__bc", "_ab cd", "ab_cd "]
+ * s1 = split_re(s, "[_ ]")
+ * s1 is a table of strings columns:
+ *     [ ["a", "a", "", "ab"],
+ *       ["bc", "", "ab", "cd"],
+ *       ["def", "bc", "cd", ""],
+ *       ["g", null, null, null] ]
+ * s2 = split_re(s, "[ _]", 1)
+ * s2 is a table of strings columns:
+ *     [ ["a", "a", "", "ab"],
+ *       ["bc def_g", "_bc", "ab cd", "cd "] ]
+ * @endcode
+ *
+ * @throw cudf::logic_error if `pattern` is empty.
+ *
+ * @param input A column of string elements to be split.
+ * @param pattern The regex pattern for delimiting characters within each string.
+ * @param maxsplit Maximum number of splits to perform.
+ *        Default of -1 indicates all possible splits on each string.
+ * @param mr Device memory resource used to allocate the returned result's device memory.
+ * @return A table of columns of strings.
+ */
+std::unique_ptr<table> split_re(
+  strings_column_view const& input,
+  std::string const& pattern,
+  size_type maxsplit                  = -1,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Splits strings elements into a table of strings columns
+ * using a regex pattern to delimit each string starting from the end of the string.
+ *
+ * Each element generates a vector of strings that are stored in corresponding
+ * rows in the output table -- `table[col,row] = token[col] of string[row]`
+ * where `token` is the substring between each delimiter.
+ *
+ * The number of rows in the output table will be the same as the number of
+ * elements in the input column. The resulting number of columns will be the
+ * maximum number of tokens found in any input row.
+ *
+ * Splitting occurs by traversing starting from the end of the input string.
+ * The `pattern` is used to identify the delimiters within a string
+ * and splitting stops when either `maxsplit` or the beginning of the string
+ * is reached.
+ *
+ * An empty input string will produce a corresponding empty string in the
+ * corresponding row of the first column.
+ * A null row will produce corresponding null rows in the output table.
+ *
+ * @code{.pseudo}
+ * s = ["a_bc def_g", "a__bc", "_ab cd", "ab_cd "]
+ * s1 = rsplit_re(s, "[_ ]")
+ * s1 is a table of strings columns:
+ *     [ ["a", "a", "", "ab"],
+ *       ["bc", "", "ab", "cd"],
+ *       ["def", "bc", "cd", ""],
+ *       ["g", null, null, null] ]
+ * s2 = rsplit_re(s, "[ _]", 1)
+ * s2 is a table of strings columns:
+ *     [ ["a_bc def", "a_", "_ab", "ab"],
+ *       ["g", "bc", "cd", "cd "] ]
+ * @endcode
+ *
+ * @throw cudf::logic_error if `pattern` is empty.
+ *
+ * @param input A column of string elements to be split.
+ * @param pattern The regex pattern for delimiting characters within each string.
+ * @param maxsplit Maximum number of splits to perform.
+ *        Default of -1 indicates all possible splits on each string.
+ * @param mr Device memory resource used to allocate the returned result's device memory.
+ * @return A table of columns of strings.
+ */
+std::unique_ptr<table> rsplit_re(
+  strings_column_view const& input,
+  std::string const& pattern,
+  size_type maxsplit                  = -1,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Splits strings elements into a list column of strings
+ * using the given regex pattern to delimit each string.
+ *
+ * Each element generates an array of strings that are stored in an output
+ * lists column -- `list[row] = [token1, token2, ...] found in input[row]`
+ * where `token` is a substring between delimiters.
+ *
+ * The number of elements in the output column will be the same as the number of
+ * elements in the input column. Each individual list item will contain the
+ * new strings for that row. The resulting number of strings in each row can vary
+ * from 0 to `maxsplit + 1`.
+ *
+ * The `pattern` is used to identify the delimiters within a string
+ * and splitting stops when either `maxsplit` or the end of the string is reached.
+ *
+ * An empty input string will produce a corresponding empty list item output row.
+ * A null row will produce a corresponding null output row.
+ *
+ * @code{.pseudo}
+ * s = ["a_bc def_g", "a__bc", "_ab cd", "ab_cd "]
+ * s1 = split_record_re(s, "[_ ]")
+ * s1 is a lists column of strings:
+ *     [ ["a", "bc", "def", "g"],
+ *       ["a", "", "bc"],
+ *       ["", "ab", "cd"],
+ *       ["ab", "cd", ""] ]
+ * s2 = split_record_re(s, "[ _]", 1)
+ * s2 is a lists column of strings:
+ *     [ ["a", "bc def_g"],
+ *       ["a", "_bc"],
+ *       ["", "ab cd"],
+ *       ["ab", "cd "] ]
+ * @endcode
+ *
+ * @throw cudf::logic_error if `pattern` is empty.
+ *
+ * @param input A column of string elements to be split.
+ * @param pattern The regex pattern for delimiting characters within each string.
+ * @param maxsplit Maximum number of splits to perform.
+ *        Default of -1 indicates all possible splits on each string.
+ * @param mr Device memory resource used to allocate the returned result's device memory.
+ * @return Lists column of strings.
+ */
+std::unique_ptr<column> split_record_re(
+  strings_column_view const& input,
+  std::string const& pattern,
+  size_type maxsplit                  = -1,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Splits strings elements into a list column of strings
+ * using the given regex pattern to delimit each string starting from the end of the string.
+ *
+ * Each element generates a vector of strings that are stored in an output
+ * lists column -- `list[row] = [token1, token2, ...] found in input[row]`
+ * where `token` is a substring between delimiters.
+ *
+ * The number of elements in the output column will be the same as the number of
+ * elements in the input column. Each individual list item will contain the
+ * new strings for that row. The resulting number of strings in each row can vary
+ * from 0 to `maxsplit + 1`.
+ *
+ * Splitting occurs by traversing starting from the end of the input string.
+ * The `pattern` is used to identify the separation points within a string
+ * and splitting stops when either `maxsplit` or the beginning of the string
+ * is reached.
+ *
+ * An empty input string will produce a corresponding empty list item output row.
+ * A null row will produce a corresponding null output row.
+ *
+ * @code{.pseudo}
+ * s = ["a_bc def_g", "a__bc", "_ab cd", "ab_cd "]
+ * s1 = rsplit_record_re(s, "[_ ]")
+ * s1 is a lists column of strings:
+ *     [ ["a", "bc", "def", "g"],
+ *       ["a", "", "bc"],
+ *       ["", "ab", "cd"],
+ *       ["ab", "cd", ""] ]
+ * s2 = rsplit_record_re(s, "[ _]", 1)
+ * s2 is a lists column of strings:
+ *     [ ["a_bc def", "g"],
+ *       ["a_", "bc"],
+ *       ["_ab", "cd"],
+ *       ["ab_cd", ""] ]
+ * @endcode
+ *
+ * @throw cudf::logic_error if `pattern` is empty.
+ *
+ * @param input A column of string elements to be split.
+ * @param pattern The regex pattern for delimiting characters within each string.
+ * @param maxsplit Maximum number of splits to perform.
+ *        Default of -1 indicates all possible splits on each string.
+ * @param mr Device memory resource used to allocate the returned result's device memory.
+ * @return Lists column of strings.
+ */
+std::unique_ptr<column> rsplit_record_re(
+  strings_column_view const& input,
+  std::string const& pattern,
+  size_type maxsplit                  = -1,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+/** @} */  // end of doxygen group
+}  // namespace strings
+}  // namespace cudf