Skip to content

Commit

Permalink
Merge pull request #3409 from davidwendt/port-nvs-regex-replace
Browse files Browse the repository at this point in the history
[REVIEW] Port NVStrings regex replace functions to cudf strings column
  • Loading branch information
jrhemstad authored Jan 17, 2020
2 parents 0faa98d + 74dcfe4 commit 70b1df0
Show file tree
Hide file tree
Showing 9 changed files with 859 additions and 0 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
## Improvements

- PR #3292 Port NVStrings regex contains function
- PR #3409 Port NVStrings regex replace function
- PR #3417 Port NVStrings regex findall function
- PR #3351 Add warning when filepath resolves to multiple files in cudf readers
- PR #3370 Port NVStrings strip functions
Expand Down
3 changes: 3 additions & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -583,6 +583,9 @@ add_library(cudf
src/strings/padding.cu
src/strings/regex/regcomp.cpp
src/strings/regex/regexec.cu
src/strings/replace/replace_re.cu
src/strings/replace/backref_re.cu
src/strings/replace/multi_re.cu
src/strings/replace/replace.cu
src/strings/sorting/sorting.cu
src/strings/split/partition.cu
Expand Down
82 changes: 82 additions & 0 deletions cpp/include/cudf/strings/replace_re.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
/*
* Copyright (c) 2019, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once

#include <cudf/strings/strings_column_view.hpp>
#include <cudf/column/column.hpp>
#include <cudf/scalar/scalar.hpp>

namespace cudf
{
namespace strings
{

/**
* @brief For each string, replaces any character sequence matching the given pattern
* with the provided replacement string.
*
* Any null string entries return corresponding null output column entries.
*
* @param strings Strings instance for this operation.
* @param pattern The regular expression pattern to search within each string.
* @param repl The string used to replace the matched sequence in each string.
* Default is an empty string.
* @param maxrepl The maximum number of times to replace the matched pattern within each string.
* Default is -1 (presumably meaning "no limit" -- confirm against the implementation).
* @param mr Resource for allocating device memory.
* Defaults to the resource returned by rmm::mr::get_default_resource().
* @return New strings column.
*/
std::unique_ptr<column> replace_re( strings_column_view const& strings,
std::string const& pattern,
string_scalar const& repl = string_scalar(""),
size_type maxrepl = -1,
rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource());

/**
* @brief For each string, replaces any character sequence matching the given patterns
* with the corresponding string in the repls column.
*
* Any null string entries return corresponding null output column entries.
*
* @param strings Strings instance for this operation.
* @param patterns The regular expression patterns to search within each string.
* @param repls The strings used for replacement; each pattern is replaced with its
* corresponding entry in this column.
* @param mr Resource for allocating device memory.
* Defaults to the resource returned by rmm::mr::get_default_resource().
* @return New strings column.
*/
std::unique_ptr<column> replace_re( strings_column_view const& strings,
std::vector<std::string> const& patterns,
strings_column_view const& repls,
rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource());

/**
* @brief For each string, replaces any character sequence matching the given pattern
* using the repl template for back-references.
*
* Back-references in the template are written as a backslash followed by a
* 1-based group number (for example `\1`); numbers must be greater than 0.
*
* Any null string entries return corresponding null output column entries.
*
* @param strings Strings instance for this operation.
* @param pattern The regular expression pattern to search within each string.
* Must not be empty.
* @param repl The replacement template for creating the output string.
* Must not be empty.
* @param mr Resource for allocating device memory.
* Defaults to the resource returned by rmm::mr::get_default_resource().
* @return New strings column.
*/
std::unique_ptr<column> replace_with_backrefs( strings_column_view const& strings,
std::string const& pattern,
std::string const& repl,
rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource());

} // namespace strings
} // namespace cudf
235 changes: 235 additions & 0 deletions cpp/src/strings/replace/backref_re.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,235 @@
/*
* Copyright (c) 2020, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <cudf/null_mask.hpp>
#include <cudf/column/column.hpp>
#include <cudf/column/column_factories.hpp>
#include <cudf/column/column_device_view.cuh>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/strings/string_view.cuh>
#include <cudf/strings/char_types/char_types.hpp>
#include <cudf/strings/replace_re.hpp>
#include <strings/utilities.hpp>
#include <strings/utilities.cuh>
#include <strings/regex/regex.cuh>

#include <regex>

namespace cudf
{
namespace strings
{
namespace detail
{
namespace
{

using backref_type = thrust::pair<size_type,size_type>;

/**
* @brief Parse the back-ref index and position values from a given replace format.
*
* The backref numbers are expected to be 1-based.
*
* Returns a modified string without back-ref indicators.
* ```
* Example:
* for input string: 'hello \2 and \1'
* the returned pairs: (2,6),(1,11)
* returned string is: 'hello and '
* ```
*/
std::string parse_backrefs( std::string const& repl, std::vector<backref_type>& backrefs )
{
std::string str = repl; // make a modifiable copy
std::smatch m;
std::regex ex("(\\\\\\d+)"); // this searches for backslash-number(s); example "\1"
std::string rtn; // result without refs
size_type byte_offset = 0;
while( std::regex_search( str, m, ex ) )
{
if( m.size()==0 )
break;
backref_type item;
std::string bref = m[0];
size_type position = static_cast<size_type>(m.position(0));
size_type length = static_cast<size_type>(bref.length());
byte_offset += position;
item.first = std::atoi(bref.c_str()+1); // back-ref index number
CUDF_EXPECTS( item.first > 0, "Back-reference numbers must be greater than 0");
item.second = byte_offset; // position within the string
rtn += str.substr(0,position);
str = str.substr(position + length);
backrefs.push_back(item);
}
if( !str.empty() ) // add the remainder
rtn += str; // of the string
return rtn;
}


/**
 * @brief This functor handles replacing strings by applying the compiled regex pattern
 * and inserting the matched group text at the back-reference positions indicated in the
 * replacement template.
 *
 * The logic includes computing the size of each string and also writing the output:
 * when `d_offsets`/`d_chars` are null only the output size in bytes is computed and
 * returned; otherwise the output bytes are also written at `d_chars + d_offsets[idx]`.
 *
 * The stack is used to keep progress on evaluating the regex instructions on each string.
 * So the size of the stack is in proportion to the number of instructions in the given regex pattern.
 *
 * There are three call types based on the number of regex instructions in the given pattern.
 * Small to medium instruction lengths can use the stack effectively though smaller executes faster.
 * Longer patterns require global memory. Shorter patterns are common in data cleaning.
 *
 * @tparam stack_size Number of bytes in each of the two working buffers given to the regex program.
 */
template<size_t stack_size>
struct backrefs_fn
{
    column_device_view const d_strings;
    reprog_device prog;
    string_view const d_repl;   // replacement template with the back-ref markers removed
    rmm::device_vector<backref_type>::iterator backrefs_begin;
    rmm::device_vector<backref_type>::iterator backrefs_end;
    const int32_t* d_offsets{}; // these are null when
    char* d_chars{};            // only computing size

    /**
     * @brief Process the string at row `idx`.
     *
     * @param idx Row index within `d_strings`.
     * @return Number of bytes in the output string for this row; 0 for a null row.
     */
    __device__ size_type operator()(size_type idx)
    {
        if( d_strings.is_null(idx) )
            return 0;
        u_char data1[stack_size];
        u_char data2[stack_size];
        prog.set_stack_mem(data1,data2);
        string_view d_str = d_strings.element<string_view>(idx);
        auto nchars = d_str.length();      // number of characters in input string
        auto nbytes = d_str.size_bytes();  // number of bytes in input string
        const char* in_ptr = d_str.data();
        char* out_ptr = d_offsets ? (d_chars + d_offsets[idx]) : nullptr;
        size_type lpos = 0;       // last byte position processed in d_str
        size_type begin = 0;      // first character position matching regex
        size_type end = nchars;   // last character position (exclusive)
        // copy input to output replacing strings as we go
        // NOTE(review): if find() can report a zero-length match (begin == end), the
        // `begin = end` update below would not advance -- confirm find()'s contract.
        while( prog.find(idx,d_str,begin,end) > 0 ) // inits the begin/end vars
        {
            auto spos = d_str.byte_offset(begin);   // get offset for these
            auto epos = d_str.byte_offset(end);     // character position values
            nbytes += d_repl.size_bytes() - (epos - spos); // compute new size
            if( out_ptr ) // copy the unmatched input that precedes this match
                out_ptr = copy_and_increment(out_ptr,in_ptr+lpos,spos-lpos);
            size_type lpos_template = 0;            // last end pos of replace template
            // Base of the template; never advanced. All reads are addressed as
            // repl_ptr + lpos_template so the in-loop copies and the remainder copy
            // below use the same offset (previously the pointer was advanced AND the
            // offset added again, skipping/overreading the trailing template text).
            auto const repl_ptr = d_repl.data();
            thrust::for_each( thrust::seq, backrefs_begin, backrefs_end,
                [&] __device__ (backref_type backref)
                {
                    if( out_ptr )
                    {
                        // literal template text between the previous back-ref and this one
                        auto const copy_length = backref.second - lpos_template;
                        out_ptr = copy_and_increment(out_ptr, repl_ptr + lpos_template, copy_length );
                        lpos_template += copy_length;
                    }
                    // extract the specific group's string for this backref's index
                    size_type spos_extract = begin; // these are modified
                    size_type epos_extract = end;   // by extract()
                    if( (prog.extract(idx,d_str,spos_extract,epos_extract,backref.first-1)<=0 ) ||
                        (epos_extract <= spos_extract) )
                        return; // no value for this backref number; that is ok
                    spos_extract = d_str.byte_offset(spos_extract); // convert
                    epos_extract = d_str.byte_offset(epos_extract); // to bytes
                    nbytes += epos_extract - spos_extract;
                    if( out_ptr )
                        out_ptr = copy_and_increment(out_ptr, d_str.data()+spos_extract, (epos_extract-spos_extract));
                });
            if( out_ptr && (lpos_template < d_repl.size_bytes()) ) // copy remainder of template
                out_ptr = copy_and_increment(out_ptr, repl_ptr + lpos_template, d_repl.size_bytes() - lpos_template);
            lpos = epos;
            begin = end;   // continue searching after this match
            end = nchars;
        }
        if( out_ptr && (lpos < d_str.size_bytes()) ) // copy remainder of input string
            memcpy(out_ptr, in_ptr+lpos, d_str.size_bytes()-lpos );
        return nbytes;
    }
};

} // namespace

/**
 * @brief Implementation of cudf::strings::replace_with_backrefs with an explicit stream.
 *
 * Compiles `pattern` into a device regex program, parses `repl` into a marker-free
 * template plus a list of back-references, and builds the output strings column via
 * `make_strings_children` using the `backrefs_fn` functor.
 *
 * Returns an empty strings column when `strings` has no rows. Otherwise a CUDF_EXPECTS
 * check fails if `pattern` or `repl` is empty.
 *
 * @param strings Strings instance for this operation.
 * @param pattern The regular expression pattern to search within each string.
 * @param repl The replacement template for creating the output string.
 * @param mr Resource for allocating device memory.
 * @param stream Stream on which to execute kernels and allocations.
 * @return New strings column.
 */
std::unique_ptr<column> replace_with_backrefs( strings_column_view const& strings,
                                               std::string const& pattern,
                                               std::string const& repl,
                                               rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource(),
                                               cudaStream_t stream = 0)
{
    auto strings_count = strings.size();
    if( strings_count==0 )
        return make_empty_strings_column(mr,stream);

    CUDF_EXPECTS( !pattern.empty(), "Parameter pattern must not be empty");
    CUDF_EXPECTS( !repl.empty(), "Parameter repl must not be empty");

    auto strings_column = column_device_view::create(strings.parent(),stream);
    auto d_strings = *strings_column;
    // compile regex into device object
    auto prog = reprog_device::create(pattern,get_character_flags_table(),strings_count,stream);
    auto d_prog = *prog;
    auto regex_insts = d_prog.insts_counts();

    // parse the repl string for backref indicators
    std::vector<backref_type> h_backrefs;
    std::string repl_template = parse_backrefs(repl,h_backrefs);
    rmm::device_vector<backref_type> backrefs(h_backrefs);
    string_scalar repl_scalar(repl_template);
    string_view d_repl_template{ repl_scalar.data(), repl_scalar.size() };

    // copy null mask; use the caller's stream and memory resource so the mask is
    // allocated the same way as the rest of the returned column
    auto null_mask = copy_bitmask(strings.parent(), stream, mr);
    auto null_count = strings.null_count();

    // create child columns
    std::pair< std::unique_ptr<column>, std::unique_ptr<column> > children(nullptr,nullptr);
    // Each invocation is predicated on the stack size which is dependent on the number of regex instructions.
    // Patterns exceeding MAX_STACK_INSTS also take the small-stack instantiation.
    if( (regex_insts > MAX_STACK_INSTS) || (regex_insts <= RX_SMALL_INSTS) )
        children = make_strings_children(backrefs_fn<RX_STACK_SMALL>{d_strings,d_prog,d_repl_template,
                                                                     backrefs.begin(), backrefs.end()},
                                         strings_count, null_count, mr, stream);
    else if( regex_insts <= RX_MEDIUM_INSTS )
        children = make_strings_children(backrefs_fn<RX_STACK_MEDIUM>{d_strings,d_prog,d_repl_template,
                                                                      backrefs.begin(), backrefs.end()},
                                         strings_count, null_count, mr, stream);
    else
        children = make_strings_children(backrefs_fn<RX_STACK_LARGE>{d_strings,d_prog,d_repl_template,
                                                                     backrefs.begin(), backrefs.end()},
                                         strings_count, null_count, mr, stream);
    //
    return make_strings_column(strings_count, std::move(children.first), std::move(children.second),
                               null_count, std::move(null_mask), stream, mr);
}

} // namespace detail

// external API

// Public entry point: forwards to the detail implementation, which supplies
// its own default for the stream argument.
std::unique_ptr<column> replace_with_backrefs( strings_column_view const& strings,
                                               std::string const& pattern,
                                               std::string const& repl,
                                               rmm::mr::device_memory_resource* mr )
{
    auto replaced = detail::replace_with_backrefs(strings, pattern, repl, mr);
    return replaced;
}

} // namespace strings
} // namespace cudf
Loading

0 comments on commit 70b1df0

Please sign in to comment.