diff --git a/CHANGELOG.md b/CHANGELOG.md index 98583038647..c7ed843e79e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,6 +19,7 @@ ## Improvements - PR #3292 Port NVStrings regex contains function +- PR #3409 Port NVStrings regex replace function - PR #3417 Port NVStrings regex findall function - PR #3351 Add warning when filepath resolves to multiple files in cudf readers - PR #3370 Port NVStrings strip functions diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index c76cdb38650..836ea404e9c 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -583,6 +583,9 @@ add_library(cudf src/strings/padding.cu src/strings/regex/regcomp.cpp src/strings/regex/regexec.cu + src/strings/replace/replace_re.cu + src/strings/replace/backref_re.cu + src/strings/replace/multi_re.cu src/strings/replace/replace.cu src/strings/sorting/sorting.cu src/strings/split/partition.cu diff --git a/cpp/include/cudf/strings/replace_re.hpp b/cpp/include/cudf/strings/replace_re.hpp new file mode 100644 index 00000000000..7d2a39a505e --- /dev/null +++ b/cpp/include/cudf/strings/replace_re.hpp @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include + +namespace cudf +{ +namespace strings +{ + +/** + * @brief For each string, replaces any character sequence matching the given pattern + * with the provided replacement string. 
+ *
+ * Any null string entries return corresponding null output column entries.
+ *
+ * @param strings Strings instance for this operation.
+ * @param pattern The regular expression pattern to search within each string.
+ * @param repl The string used to replace the matched sequence in each string.
+ *        Default is an empty string.
+ * @param maxrepl The maximum number of times to replace the matched pattern within each string.
+ *        A negative value (the default is -1) places no caller-imposed limit on the
+ *        number of replacements.
+ * @param mr Resource for allocating device memory.
+ * @return New strings column.
+ */
+std::unique_ptr replace_re( strings_column_view const& strings,
+                            std::string const& pattern,
+                            string_scalar const& repl = string_scalar(""),
+                            size_type maxrepl = -1,
+                            rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource());
+
+/**
+ * @brief For each string, replaces any character sequence matching the given patterns
+ * with the corresponding string in the repls column.
+ *
+ * Any null string entries return corresponding null output column entries.
+ *
+ * @param strings Strings instance for this operation.
+ * @param patterns The regular expression patterns to search within each string.
+ * @param repls The strings used for replacement; must not contain nulls.
+ *        When it holds a single string, that string is used for every pattern;
+ *        otherwise entry i is the replacement for patterns[i].
+ * @param mr Resource for allocating device memory.
+ * @return New strings column.
+ */
+std::unique_ptr replace_re( strings_column_view const& strings,
+                            std::vector const& patterns,
+                            strings_column_view const& repls,
+                            rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource());
+
+/**
+ * @brief For each string, replaces any character sequence matching the given pattern
+ * using the repl template for back-references.
+ *
+ * Any null string entries return corresponding null output column entries.
+ *
+ * @param strings Strings instance for this operation.
+ * @param pattern The regular expression pattern to search within each string.
+ *        Must not be empty.
+ * @param repl The replacement template for creating the output string.
+ *        Back-references are written as a backslash followed by a 1-based
+ *        group number (for example `\1`). Must not be empty.
+ * @param mr Resource for allocating device memory.
+ * @return New strings column.
+ */
+std::unique_ptr replace_with_backrefs( strings_column_view const& strings,
+                                       std::string const& pattern,
+                                       std::string const& repl,
+                                       rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource());
+
+} // namespace strings
+} // namespace cudf
diff --git a/cpp/src/strings/replace/backref_re.cu b/cpp/src/strings/replace/backref_re.cu
new file mode 100644
index 00000000000..fcaa3c7fb53
--- /dev/null
+++ b/cpp/src/strings/replace/backref_re.cu
@@ -0,0 +1,235 @@
+/*
+ * Copyright (c) 2020, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+namespace cudf
+{
+namespace strings
+{
+namespace detail
+{
+namespace
+{
+
+// One parsed back-reference: `first` is the 1-based group number and `second`
+// is the byte position in the replacement template (with the back-ref
+// indicators removed) at which that group's text is inserted.
+using backref_type = thrust::pair;
+
+/**
+ * @brief Parse the back-ref index and position values from a given replace format.
+ *
+ * The backref numbers are expected to be 1-based.
+ *
+ * Returns a modified string without back-ref indicators.
+ * ```
+ * Example:
+ *    for input string:    'hello \2 and \1'
+ *    the returned pairs:  (2,6),(1,11)
+ *    returned string is:  'hello  and '
+ * ```
+ *
+ * Each returned position is a byte offset into the returned string (not into
+ * the input), i.e. where the group text is to be inserted.
+ *
+ * @param repl Replacement template that may contain back-ref indicators.
+ * @param backrefs Output; one (index, position) pair is appended per back-ref found,
+ *                 in the order they occur in `repl`.
+ * @return Copy of `repl` with all back-ref indicators removed.
+ */
+std::string parse_backrefs( std::string const& repl, std::vector& backrefs )
+{
+    std::string str = repl; // make a modifiable copy; consumed front-to-back below
+    std::smatch m;
+    std::regex ex("(\\\\\\d+)"); // this searches for backslash-number(s); example "\1"
+    std::string rtn;         // result without refs
+    size_type byte_offset = 0; // running length of rtn == insert position of the next back-ref
+    while( std::regex_search( str, m, ex ) )
+    {
+        if( m.size()==0 )
+            break;
+        backref_type item;
+        std::string bref = m[0];
+        size_type position = static_cast(m.position(0)); // offset of the match within the remaining str
+        size_type length = static_cast(bref.length());   // length of the "\N" indicator itself
+        byte_offset += position;
+        item.first = std::atoi(bref.c_str()+1); // back-ref index number (skip the backslash)
+        CUDF_EXPECTS( item.first > 0, "Back-reference numbers must be greater than 0");
+        item.second = byte_offset;              // position within the string
+        rtn += str.substr(0,position);          // keep the literal text preceding the indicator
+        str = str.substr(position + length);    // drop everything up to and including the indicator
+        backrefs.push_back(item);
+    }
+    if( !str.empty() ) // add the remainder
+        rtn += str;    // of the string
+    return rtn;
+}
+
+
+/**
+ * @brief This functor handles replacing strings by applying the compiled regex pattern
+ * and inserting the matched group text at the backref positions indicated in the
+ * replacement template.
+ *
+ * The logic includes computing the size of each string and also writing the output.
+ *
+ * The stack is used to keep progress on evaluating the regex instructions on each string.
+ * So the size of the stack is in proportion to the number of instructions in the given regex pattern.
+ *
+ * There are three call types based on the number of regex instructions in the given pattern.
+ * Small to medium instruction lengths can use the stack effectively though smaller executes faster.
+ * Longer patterns require global memory. Shorter patterns are common in data cleaning.
+ * + */ +template +struct backrefs_fn +{ + column_device_view const d_strings; + reprog_device prog; + string_view const d_repl; // string replacement template + rmm::device_vector::iterator backrefs_begin; + rmm::device_vector::iterator backrefs_end; + const int32_t* d_offsets{}; // these are null when + char* d_chars{}; // only computing size + + __device__ size_type operator()(size_type idx) + { + if( d_strings.is_null(idx) ) + return 0; + u_char data1[stack_size]; + u_char data2[stack_size]; + prog.set_stack_mem(data1,data2); + string_view d_str = d_strings.element(idx); + auto nchars = d_str.length(); // number of characters in input string + auto nbytes = d_str.size_bytes(); // number of bytes in input string + const char* in_ptr = d_str.data(); + char* out_ptr = d_offsets ? (d_chars + d_offsets[idx]) : nullptr; + size_type lpos = 0; // last byte position processed in d_str + size_type begin = 0; // first character position matching regex + size_type end = nchars; // last character position (exclusive) + // copy input to output replacing strings as we go + while( prog.find(idx,d_str,begin,end) > 0 ) // inits the begin/end vars + { + auto spos = d_str.byte_offset(begin); // get offset for these + auto epos = d_str.byte_offset(end); // character position values + nbytes += d_repl.size_bytes() - (epos - spos); // compute new size + if( out_ptr ) + out_ptr = copy_and_increment(out_ptr,in_ptr+lpos,spos-lpos); + size_type lpos_template = 0; // last end pos of replace template + auto repl_ptr = d_repl.data(); // replace template pattern + thrust::for_each( thrust::seq, backrefs_begin, backrefs_end, + [&] __device__ (backref_type backref) + { + if( out_ptr ) + { + auto const copy_length = backref.second - lpos_template; + out_ptr = copy_and_increment(out_ptr, repl_ptr, copy_length ); + repl_ptr += copy_length; + lpos_template += copy_length; + } + // extract the specific group's string for this backref's index + size_type spos_extract = begin; // these are modified 
+ size_type epos_extract = end; // by extract() + if( (prog.extract(idx,d_str,spos_extract,epos_extract,backref.first-1)<=0 ) || + (epos_extract <= spos_extract) ) + return; // no value for this backref number; that is ok + spos_extract = d_str.byte_offset(spos_extract); // convert + epos_extract = d_str.byte_offset(epos_extract); // to bytes + nbytes += epos_extract - spos_extract; + if( out_ptr ) + out_ptr = copy_and_increment(out_ptr, d_str.data()+spos_extract, (epos_extract-spos_extract)); + }); + if( out_ptr && (lpos_template < d_repl.size_bytes()) )// copy remainder of template + out_ptr = copy_and_increment(out_ptr, repl_ptr+lpos_template, d_repl.size_bytes() - lpos_template); + lpos = epos; + begin = end; + end = nchars; + } + if( out_ptr && (lpos < d_str.size_bytes()) ) // copy remainder of input string + memcpy(out_ptr, in_ptr+lpos, d_str.size_bytes()-lpos ); + return nbytes; + } +}; + +} // namespace + +// +std::unique_ptr replace_with_backrefs( strings_column_view const& strings, + std::string const& pattern, + std::string const& repl, + rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource(), + cudaStream_t stream = 0) +{ + auto strings_count = strings.size(); + if( strings_count==0 ) + return make_empty_strings_column(mr,stream); + + CUDF_EXPECTS( !pattern.empty(), "Parameter pattern must not be empty"); + CUDF_EXPECTS( !repl.empty(), "Parameter repl must not be empty"); + + auto strings_column = column_device_view::create(strings.parent(),stream); + auto d_strings = *strings_column; + // compile regex into device object + auto prog = reprog_device::create(pattern,get_character_flags_table(),strings_count,stream); + auto d_prog = *prog; + auto regex_insts = d_prog.insts_counts(); + + // parse the repl string for backref indicators + std::vector h_backrefs; + std::string repl_template = parse_backrefs(repl,h_backrefs); + rmm::device_vector backrefs(h_backrefs); + string_scalar repl_scalar(repl_template); + string_view d_repl_template{ 
repl_scalar.data(), repl_scalar.size() }; + + // copy null mask + auto null_mask = copy_bitmask(strings.parent()); + auto null_count = strings.null_count(); + + // create child columns + std::pair< std::unique_ptr, std::unique_ptr > children(nullptr,nullptr); + // Each invocation is predicated on the stack size which is dependent on the number of regex instructions + if( (regex_insts > MAX_STACK_INSTS) || (regex_insts <= RX_SMALL_INSTS) ) + children = make_strings_children(backrefs_fn{d_strings,d_prog,d_repl_template, + backrefs.begin(), backrefs.end()}, + strings_count, null_count, mr, stream); + else if( regex_insts <= RX_MEDIUM_INSTS ) + children = make_strings_children(backrefs_fn{d_strings,d_prog,d_repl_template, + backrefs.begin(), backrefs.end()}, + strings_count, null_count, mr, stream); + else + children = make_strings_children(backrefs_fn{d_strings,d_prog,d_repl_template, + backrefs.begin(), backrefs.end()}, + strings_count, null_count, mr, stream); + // + return make_strings_column(strings_count, std::move(children.first), std::move(children.second), + null_count, std::move(null_mask), stream, mr); +} + +} // namespace detail + +// external API + +std::unique_ptr replace_with_backrefs( strings_column_view const& strings, + std::string const& pattern, + std::string const& repl, + rmm::mr::device_memory_resource* mr ) +{ + return detail::replace_with_backrefs(strings, pattern, repl, mr); +} + +} // namespace strings +} // namespace cudf diff --git a/cpp/src/strings/replace/multi_re.cu b/cpp/src/strings/replace/multi_re.cu new file mode 100644 index 00000000000..fc9b1841fdd --- /dev/null +++ b/cpp/src/strings/replace/multi_re.cu @@ -0,0 +1,185 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace cudf +{ +namespace strings +{ +namespace detail +{ +namespace +{ + +/** + * @brief This functor handles replacing strings by applying the compiled regex patterns + * and inserting the corresponding new string within the matched range of characters. + * + * The logic includes computing the size of each string and also writing the output. + * + * The stack is used to keep progress on evaluating the regex instructions on each string. + * So the size of the stack is in proportion to the number of instructions in the given regex pattern. + * + * There are three call types based on the number of regex instructions in the given pattern. + * Small to medium instruction lengths can use the stack effectively though smaller executes faster. + * Longer patterns require global memory. Shorter patterns are common in data cleaning. 
+ * + */ +template +struct replace_multi_regex_fn +{ + column_device_view const d_strings; + reprog_device* progs; // array of regex progs + size_type number_of_patterns; + column_device_view const d_repls; // replacment strings + const int32_t* d_offsets{}; // these are null when + char* d_chars{}; // only computing size + + __device__ size_type operator()(size_type idx) + { + if( d_strings.is_null(idx) ) + return 0; + u_char data1[stack_size]; + u_char data2[stack_size]; + string_view d_str = d_strings.element(idx); + auto nchars = d_str.length(); // number of characters in input string + auto nbytes = d_str.size_bytes(); // number of bytes in input string + const char* in_ptr = d_str.data(); // input pointer (i) + char* out_ptr = nullptr; // running output pointer (o) + if( d_offsets ) + out_ptr = d_chars + d_offsets[idx]; + size_type lpos = 0; + size_type ch_pos = 0; + while( ch_pos < nchars ) + { + for( size_type ptn_idx=0; ptn_idx < number_of_patterns; ++ptn_idx ) + { + reprog_device prog = progs[ptn_idx]; + prog.set_stack_mem(data1,data2); + size_type begin = ch_pos, end = ch_pos+1; + if( prog.find(idx,d_str,begin,end) > 0 ) + { + string_view d_repl = d_repls.size() > 1 ? 
+ d_repls.element(ptn_idx) : + d_repls.element(0); + auto spos = d_str.byte_offset(begin); + auto epos = d_str.byte_offset(end); + nbytes += d_repl.size_bytes() - (epos - spos); + if( out_ptr ) + { + out_ptr = copy_and_increment(out_ptr,in_ptr+lpos,spos-lpos); + out_ptr = copy_string(out_ptr, d_repl); + lpos = epos; + } + ch_pos = end - 1; + break; // go to next character position + } + } + ++ch_pos; + } + if( out_ptr ) // copy the remainder + memcpy(out_ptr, in_ptr+lpos, d_str.size_bytes()-lpos); + return nbytes; + } +}; + +} // namespace + +// +std::unique_ptr replace_re( strings_column_view const& strings, + std::vector const& patterns, + strings_column_view const& repls, + rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource(), + cudaStream_t stream = 0) +{ + auto strings_count = strings.size(); + if( strings_count==0 ) + return make_empty_strings_column(mr,stream); + if( patterns.empty() ) // no patterns; just return a copy + return std::make_unique(strings.parent()); + + CUDF_EXPECTS( !repls.has_nulls(), "Parameter repls must not have any nulls"); + + auto strings_column = column_device_view::create(strings.parent(),stream); + auto d_strings = *strings_column; + auto repls_column = column_device_view::create(repls.parent(),stream); + auto d_repls = *repls_column; + auto d_flags = get_character_flags_table(); + // compile regexes into device objects + size_type regex_insts = 0; + std::vector > > h_progs; + rmm::device_vector progs; + for( auto itr = patterns.begin(); itr != patterns.end(); ++itr ) + { + auto prog = reprog_device::create(*itr,d_flags,strings_count,stream); + auto insts = prog->insts_counts(); + if( insts > regex_insts ) + regex_insts = insts; + progs.push_back(*prog); + h_progs.emplace_back(std::move(prog)); + } + auto d_progs = progs.data().get(); + + // copy null mask + auto null_mask = copy_bitmask(strings.parent()); + auto null_count = strings.null_count(); + + // create child columns + std::pair< std::unique_ptr, 
std::unique_ptr > children(nullptr,nullptr); + // Each invocation is predicated on the stack size which is dependent on the number of regex instructions + if( (regex_insts > MAX_STACK_INSTS) || (regex_insts <= RX_SMALL_INSTS) ) + children = make_strings_children( + replace_multi_regex_fn{d_strings,d_progs,static_cast(progs.size()),d_repls}, + strings_count, null_count, mr, stream); + else if( regex_insts <= RX_MEDIUM_INSTS ) + children = make_strings_children( + replace_multi_regex_fn{d_strings,d_progs,static_cast(progs.size()),d_repls}, + strings_count, null_count, mr, stream); + else + children = make_strings_children( + replace_multi_regex_fn{d_strings,d_progs,static_cast(progs.size()),d_repls}, + strings_count, null_count, mr, stream); + // + return make_strings_column(strings_count, std::move(children.first), std::move(children.second), + null_count, std::move(null_mask), stream, mr); +} + +} // namespace detail + +// external API + +std::unique_ptr replace_re( strings_column_view const& strings, + std::vector const& patterns, + strings_column_view const& repls, + rmm::mr::device_memory_resource* mr ) +{ + return detail::replace_re(strings, patterns, repls, mr); +} + +} // namespace strings +} // namespace cudf diff --git a/cpp/src/strings/replace/replace_re.cu b/cpp/src/strings/replace/replace_re.cu new file mode 100644 index 00000000000..16960518523 --- /dev/null +++ b/cpp/src/strings/replace/replace_re.cu @@ -0,0 +1,166 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace cudf +{ +namespace strings +{ +namespace detail +{ +namespace +{ + +/** + * @brief This functor handles replacing strings by applying the compiled regex pattern + * and inserting the new string within the matched range of characters. + * + * The logic includes computing the size of each string and also writing the output. + * + * The stack is used to keep progress on evaluating the regex instructions on each string. + * So the size of the stack is in proportion to the number of instructions in the given regex pattern. + * + * There are three call types based on the number of regex instructions in the given pattern. + * Small to medium instruction lengths can use the stack effectively though smaller executes faster. + * Longer patterns require global memory. Shorter patterns are common in data cleaning. 
+ *
+ * The functor is invoked twice by make_strings_children(): first with d_offsets and
+ * d_chars null to compute each output size, then with both set to write the output.
+ */
+template
+struct replace_regex_fn
+{
+    column_device_view const d_strings;
+    reprog_device prog;
+    string_view const d_repl;   // replacement string
+    size_type maxrepl;          // maximum replacements per string; negative means no caller limit
+    const int32_t* d_offsets{}; // these are null when
+    char* d_chars{};            // only computing size
+
+    /**
+     * @brief Computes the output size for string `idx` and, when d_offsets/d_chars
+     * are set, also writes the replaced string into d_chars.
+     *
+     * @param idx Row index within d_strings.
+     * @return Number of bytes in the output string; 0 for a null row.
+     */
+    __device__ size_type operator()(size_type idx)
+    {
+        if( d_strings.is_null(idx) )
+            return 0;
+        u_char data1[stack_size];
+        u_char data2[stack_size];
+        prog.set_stack_mem(data1,data2);
+        string_view d_str = d_strings.element(idx);
+        auto mxn = maxrepl;
+        auto nchars = d_str.length();      // number of characters in input string
+        auto nbytes = d_str.size_bytes();  // number of bytes in input string
+        if( mxn < 0 )
+            mxn = nchars; // max possible replaces for this string
+        const char* in_ptr = d_str.data(); // input pointer (i)
+        char* out_ptr = nullptr;           // running output pointer (o)
+        if( d_offsets )
+            out_ptr = d_chars + d_offsets[idx];
+        size_type lpos = 0;     // last input byte position already copied to the output
+        size_type begin = 0;    // find() updates begin/end to the character range of the match
+        size_type end = nchars; // working vars
+        // copy input to output replacing strings as we go
+        while( mxn-- > 0 ) // maximum number of replaces
+        {
+            if( prog.find(idx,d_str,begin,end) <= 0 )
+                break; // no more matches
+            auto spos = d_str.byte_offset(begin); // get offset for these
+            auto epos = d_str.byte_offset(end);   // character position values
+            nbytes += d_repl.size_bytes() - (epos - spos); // compute new size
+            // Legend for the diagrams below: i = input, o = output,
+            // b = bytes before the match, s = matched bytes, e = bytes after, r = replacement
+            if( out_ptr ) // replace
+            {                                                                // i:bbbbsssseeee
+                out_ptr = copy_and_increment(out_ptr,in_ptr+lpos,spos-lpos); // o:bbbb
+                out_ptr = copy_string(out_ptr, d_repl);                      // o:bbbbrrrrrr
+                                                                             //  out_ptr ---^
+                lpos = epos;                                                 // i:bbbbsssseeee
+            }                                                                //  in_ptr --^
+            begin = end;    // resume the search just past this match
+            end = nchars;
+        }
+        if( out_ptr ) // copy the remainder
+            memcpy(out_ptr, in_ptr+lpos, d_str.size_bytes()-lpos); // o:bbbbrrrrrreeee
+        return nbytes;
+    }
+};
+
+} // namespace
+
+/**
+ * @brief Builds a new strings column in which regex matches are replaced by `repl`.
+ *
+ * An empty input column yields an empty strings column. `repl` must be a valid
+ * scalar. The null mask of the input is copied to the output.
+ *
+ * @param strings Strings instance for this operation.
+ * @param pattern The regular expression pattern to search within each string.
+ * @param repl The string used to replace the matched sequence in each string.
+ * @param maxrepl The maximum number of replacements per string; negative means no caller limit.
+ * @param mr Resource for allocating device memory.
+ * @param stream Stream to use for any kernel calls.
+ * @return New strings column.
+ */
+std::unique_ptr replace_re( strings_column_view const& strings,
+                            std::string const& pattern,
+                            string_scalar const& repl = string_scalar(""),
+                            size_type maxrepl = -1,
+                            rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource(),
+                            cudaStream_t stream = 0)
+{
+    auto strings_count = strings.size();
+    if( strings_count==0 )
+        return make_empty_strings_column(mr,stream);
+
+    CUDF_EXPECTS( repl.is_valid(), "Parameter repl must be valid");
+    string_view d_repl( repl.data(), repl.size() );
+
+    auto strings_column = column_device_view::create(strings.parent(),stream);
+    auto d_strings = *strings_column;
+    // compile regex into device object
+    auto prog = reprog_device::create(pattern,get_character_flags_table(),strings_count,stream);
+    auto d_prog = *prog;
+    auto regex_insts = d_prog.insts_counts();
+
+    // copy null mask
+    auto null_mask = copy_bitmask(strings.parent());
+    auto null_count = strings.null_count();
+
+    // create child columns
+    std::pair< std::unique_ptr, std::unique_ptr > children(nullptr,nullptr);
+    // Each invocation is predicated on the stack size which is dependent on the number of regex instructions
+    // NOTE(review): the three branches read identically here; presumably each instantiates
+    // replace_regex_fn with a different stack-size template argument -- confirm.
+    if( (regex_insts > MAX_STACK_INSTS) || (regex_insts <= RX_SMALL_INSTS) )
+        children = make_strings_children(replace_regex_fn{d_strings,d_prog,d_repl,maxrepl},
+                                         strings_count, null_count, mr, stream);
+    else if( regex_insts <= RX_MEDIUM_INSTS )
+        children = make_strings_children(replace_regex_fn{d_strings,d_prog,d_repl,maxrepl},
+                                         strings_count, null_count, mr, stream);
+    else
+        children = make_strings_children(replace_regex_fn{d_strings,d_prog,d_repl,maxrepl},
+                                         strings_count, null_count, mr, stream);
+    //
+    return make_strings_column(strings_count, std::move(children.first), std::move(children.second),
+                               null_count, std::move(null_mask), stream, mr);
+}
+
+} // namespace detail
+
+// external API
+
+std::unique_ptr replace_re( strings_column_view const& strings,
+                            std::string const& pattern,
+                            string_scalar const& repl,
+                            size_type maxrepl,
+                            rmm::mr::device_memory_resource* mr )
+{
+    return detail::replace_re(strings, pattern, repl, maxrepl, mr);
+}
+
+} // namespace strings
+} // namespace cudf
diff --git
a/cpp/src/strings/utilities.cuh b/cpp/src/strings/utilities.cuh index 8a11bd4b21b..5a159c4ac94 100644 --- a/cpp/src/strings/utilities.cuh +++ b/cpp/src/strings/utilities.cuh @@ -57,6 +57,38 @@ __device__ inline char* copy_string( char* buffer, const string_view& d_string ) } +/** + * @brief Creates child offsets and chars columns by applying the template function that + * can be used for computing the output size of each string as well as create the output. + * + * @tparam SizeAndExecuteFunction Function must accept an index and return a size. + * It must also have members d_offsets and d_chars which are set to + * memory containing the offsets and chars columns during write. + * + * @param size_and_exec_fn This is called twice. Once for the output size of each string. + * After that, the d_offsets and d_chars are set and this is called again to fill in the chars memory. + * @param strings_count Number of strings. + * @param null_count Number of nulls in the strings column. + * @param mr Memory resource to use. + * @param stream Stream to use for any kernel calls. 
+ * @return offsets child column and chars child column for a strings column
+ */
+template
+auto make_strings_children( SizeAndExecuteFunction size_and_exec_fn, size_type strings_count, size_type null_count,
+                            rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource(),
+                            cudaStream_t stream = 0)
+{
+    // First pass: the functor is called with d_offsets/d_chars still null, so it only
+    // reports each string's output size; those sizes are turned into the offsets column.
+    auto transformer = thrust::make_transform_iterator( thrust::make_counting_iterator(0), size_and_exec_fn );
+    auto offsets_column = make_offsets_child_column(transformer, transformer + strings_count, mr, stream);
+    auto d_offsets = offsets_column->view().template data();
+    // The last offset entry is passed as the size of the chars column.
+    auto chars_column = create_chars_child_column( strings_count, null_count, thrust::device_pointer_cast(d_offsets)[strings_count], mr, stream );
+    // Second pass: hand the functor the output buffers and run it again to write the chars.
+    size_and_exec_fn.d_offsets = d_offsets; // set the offsets
+    size_and_exec_fn.d_chars = chars_column->mutable_view().template data(); // fill in the chars
+    thrust::for_each_n(rmm::exec_policy(stream)->on(stream), thrust::make_counting_iterator(0), strings_count, size_and_exec_fn);
+    return std::make_pair(std::move(offsets_column),std::move(chars_column));
+}
+
+
 /**
  * @brief Utility to create a null mask for a strings column using a custom function.
* diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 053b5b9b3c1..d313e90bfdb 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -778,6 +778,7 @@ set(STRINGS_TEST_SRC "${CMAKE_CURRENT_SOURCE_DIR}/strings/integers_tests.cu" "${CMAKE_CURRENT_SOURCE_DIR}/strings/ipv4_tests.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/strings/pad_tests.cu" + "${CMAKE_CURRENT_SOURCE_DIR}/strings/replace_regex_tests.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/strings/replace_tests.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/strings/split_tests.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/strings/strip_tests.cpp" diff --git a/cpp/tests/strings/replace_regex_tests.cpp b/cpp/tests/strings/replace_regex_tests.cpp new file mode 100644 index 00000000000..f38f7ec6dbf --- /dev/null +++ b/cpp/tests/strings/replace_regex_tests.cpp @@ -0,0 +1,154 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include "./utilities.h"
+
+#include
+#include
+
+
+// Fixture shared by the regex replace tests below.
+struct StringsReplaceTests : public cudf::test::BaseFixture {};
+
+
+// Single pattern, fixed replacement: every whole word "the" becomes "=".
+// The input includes an empty string and a null entry.
+TEST_F(StringsReplaceTests, ReplaceRegexTest)
+{
+    std::vector h_strings{ "the quick brown fox jumps over the lazy dog",
+                                        "the fat cat lays next to the other accénted cat",
+                                        "a slow moving turtlé cannot catch the bird",
+                                        "which can be composéd together to form a more complete",
+                                        "thé result does not include the value in the sum in",
+                                        "", nullptr };
+
+    // the transform iterator supplies the validity of each row (nullptr => null)
+    cudf::test::strings_column_wrapper strings( h_strings.begin(), h_strings.end(),
+        thrust::make_transform_iterator( h_strings.begin(), [] (auto str) { return str!=nullptr; }));
+    auto strings_view = cudf::strings_column_view(strings);
+
+    std::vector h_expected{ "= quick brown fox jumps over = lazy dog",
+                                         "= fat cat lays next to = other accénted cat",
+                                         "a slow moving turtlé cannot catch = bird",
+                                         "which can be composéd together to form a more complete",
+                                         "thé result does not include = value in = sum in",
+                                         "", nullptr };
+
+    std::string pattern = "(\\bthe\\b)";
+    auto results = cudf::strings::replace_re(strings_view,pattern,cudf::string_scalar("="));
+    cudf::test::strings_column_wrapper expected( h_expected.begin(), h_expected.end(),
+        thrust::make_transform_iterator( h_expected.begin(), [] (auto str) { return str!=nullptr; }));
+    cudf::test::expect_columns_equal(*results,expected);
+}
+
+// Multiple patterns, one replacement string per pattern.
+TEST_F(StringsReplaceTests, ReplaceMultiRegexTest)
+{
+    std::vector h_strings{ "the quick brown fox jumps over the lazy dog",
+                                        "the fat cat lays next to the other accénted cat",
+                                        "a slow moving turtlé cannot catch the bird",
+                                        "which can be composéd together to form a more complete",
+                                        "thé result does not include the value in the sum in",
+                                        "", nullptr };
+
+    cudf::test::strings_column_wrapper strings( h_strings.begin(), h_strings.end(),
+        thrust::make_transform_iterator( h_strings.begin(), [] (auto str) { return str!=nullptr; }));
+    auto strings_view = cudf::strings_column_view(strings);
+
+    // NOTE(review): replacing "the" with an empty string between two spaces would leave
+    // two adjacent spaces; confirm the whitespace in these literals is intact.
+    std::vector h_expected{ " quick brown fox jumps over lazy dog",
+                                         " fat cat lays next to other accénted cat",
+                                         "** slow moving turtlé cannot catch bird",
+                                         "which can be composéd together to form ** more complete",
+                                         "thé result does not include value N sum N",
+                                         "", nullptr };
+
+    std::vector patterns{"\\bthe\\b","\\bin\\b","\\ba\\b"};
+    std::vector h_repls{ "", "N", "**"};
+    cudf::test::strings_column_wrapper repls( h_repls.begin(), h_repls.end() );
+    auto repls_view = cudf::strings_column_view(repls);
+    auto results = cudf::strings::replace_re(strings_view,patterns,repls_view);
+    cudf::test::strings_column_wrapper expected( h_expected.begin(), h_expected.end(),
+        thrust::make_transform_iterator( h_expected.begin(), [] (auto str) { return str!=nullptr; }));
+    cudf::test::expect_columns_equal(*results,expected);
+}
+
+// Back-reference template: "<word-char> <word-char>" is rewritten with a '-' between the two groups.
+TEST_F(StringsReplaceTests, ReplaceBackrefsRegexTest)
+{
+    std::vector h_strings{ "the quick brown fox jumps over the lazy dog",
+                                        "the fat cat lays next to the other accénted cat",
+                                        "a slow moving turtlé cannot catch the bird",
+                                        "which can be composéd together to form a more complete",
+                                        "thé result does not include the value in the sum in",
+                                        "", nullptr };
+
+    cudf::test::strings_column_wrapper strings( h_strings.begin(), h_strings.end(),
+        thrust::make_transform_iterator( h_strings.begin(), [] (auto str) { return str!=nullptr; }));
+    auto strings_view = cudf::strings_column_view(strings);
+
+    std::vector h_expected{ "the-quick-brown-fox-jumps-over-the-lazy-dog",
+                                         "the-fat-cat-lays-next-to-the-other-accénted-cat",
+                                         "a-slow-moving-turtlé-cannot-catch-the-bird",
+                                         "which-can-be-composéd-together-to-form-a more-complete",
+                                         "thé-result-does-not-include-the-value-in-the-sum-in",
+                                         "", nullptr };
+
+    std::string pattern = "(\\w) (\\w)";
+    std::string repl_template = "\\1-\\2";
+    auto results = cudf::strings::replace_with_backrefs(strings_view,pattern,repl_template);
+    cudf::test::strings_column_wrapper expected( h_expected.begin(), h_expected.end(),
+        thrust::make_transform_iterator( h_expected.begin(), [] (auto str) { return str!=nullptr; }));
+    cudf::test::expect_columns_equal(*results,expected);
+}
+
+// Exercises the medium-size dispatch branch of replace_re with the default (empty) replacement.
+TEST_F(StringsReplaceTests, MediumReplaceRegex)
+{
+    // This results in 95 regex instructions and falls in the 'medium' range.
+    std::string medium_regex = "hello @abc @def world The quick brown @fox jumps over the lazy @dog hello http://www.world.com";
+
+    std::vector h_strings{
+        "hello @abc @def world The quick brown @fox jumps over the lazy @dog hello http://www.world.com thats all",
+        "12345678901234567890", "abcdefghijklmnopqrstuvwxyz"
+    };
+    cudf::test::strings_column_wrapper strings( h_strings.begin(), h_strings.end(),
+        thrust::make_transform_iterator( h_strings.begin(), [] (auto str) { return str!=nullptr; }));
+
+    auto strings_view = cudf::strings_column_view(strings);
+    auto results = cudf::strings::replace_re(strings_view, medium_regex);
+    std::vector h_expected{" thats all", "12345678901234567890", "abcdefghijklmnopqrstuvwxyz" };
+    cudf::test::strings_column_wrapper expected( h_expected.begin(), h_expected.end(),
+        thrust::make_transform_iterator( h_expected.begin(), [] (auto str) { return str!=nullptr; }));
+    cudf::test::expect_columns_equal(*results,expected);
+}
+
+// Exercises the large-size dispatch branch of replace_re with the default (empty) replacement.
+TEST_F(StringsReplaceTests, LargeReplaceRegex)
+{
+    // This results in 117 regex instructions and falls in the 'large' range.
+    std::string large_regex = "hello @abc @def world The (quick) brown @fox jumps over the lazy @dog hello http://www.world.com I'm here @home zzzz";
+
+    std::vector h_strings{
+        "zzzz hello @abc @def world The quick brown @fox jumps over the lazy @dog hello http://www.world.com I'm here @home zzzz",
+        "12345678901234567890", "abcdefghijklmnopqrstuvwxyz"
+    };
+    cudf::test::strings_column_wrapper strings( h_strings.begin(), h_strings.end(),
+        thrust::make_transform_iterator( h_strings.begin(), [] (auto str) { return str!=nullptr; }));
+
+    auto strings_view = cudf::strings_column_view(strings);
+    auto results = cudf::strings::replace_re(strings_view, large_regex);
+    std::vector h_expected{"zzzz ", "12345678901234567890", "abcdefghijklmnopqrstuvwxyz" };
+    cudf::test::strings_column_wrapper expected( h_expected.begin(), h_expected.end(),
+        thrust::make_transform_iterator( h_expected.begin(), [] (auto str) { return str!=nullptr; }));
+    cudf::test::expect_columns_equal(*results,expected);
+}