Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cleanup libcudf strings regex classes #10573

Merged
merged 16 commits into from
Apr 14, 2022
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 35 additions & 2 deletions cpp/src/strings/regex/regcomp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

#include <strings/regex/regcomp.h>

#include <cudf/strings/detail/utf8.hpp>
#include <cudf/utilities/error.hpp>

#include <algorithm>
Expand Down Expand Up @@ -58,6 +59,37 @@ const std::array<char, 33> escapable_chars{
{'.', '-', '+', '*', '\\', '?', '^', '$', '|', '{', '}', '(', ')', '[', ']', '<', '>',
'"', '~', '\'', '`', '_', '@', '=', ';', ':', '!', '#', '%', '&', ',', '/', ' '}};

/**
* @brief Converts UTF-8 string into fixed-width 32-bit character vector.
*
* No character conversion occurs.
* Each UTF-8 character is promoted into a 32-bit value.
* The last entry in the returned vector will be a 0 value.
* The fixed-width vector makes it easier to compile and faster to execute.
*
* @param pattern Regular expression encoded with UTF-8.
* @return Fixed-width 32-bit character vector.
*/
std::vector<char32_t> string_to_char32_vector(std::string const& pattern)
{
size_type size = static_cast<size_type>(pattern.size());
size_type count = std::count_if(pattern.cbegin(), pattern.cend(), [](char ch) {
return is_begin_utf8_char(static_cast<uint8_t>(ch));
});
std::vector<char32_t> result(count + 1);
char32_t* output_ptr = result.data();
const char* input_ptr = pattern.data();
for (size_type idx = 0; idx < size; ++idx) {
char_utf8 output_character = 0;
size_type ch_width = to_char_utf8(input_ptr, output_character);
input_ptr += ch_width;
idx += ch_width - 1;
*output_ptr++ = output_character;
}
result[count] = 0; // last entry set to 0
return result;
}

} // namespace

int32_t reprog::add_inst(int32_t t)
Expand Down Expand Up @@ -838,10 +870,11 @@ class regex_compiler {
};

// Convert pattern into program
reprog reprog::create_from(const char32_t* pattern, regex_flags const flags)
reprog reprog::create_from(std::string const& pattern, regex_flags const flags)
davidwendt marked this conversation as resolved.
Show resolved Hide resolved
{
reprog rtn;
regex_compiler compiler(pattern, flags, rtn);
auto pattern32 = string_to_char32_vector(pattern);
regex_compiler compiler(pattern32.data(), flags, rtn);
// for debugging, it can be helpful to call rtn.print(flags) here to dump
// out the instructions that have been created from the given pattern
return rtn;
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/strings/regex/regcomp.h
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ class reprog {
* @brief Parses the given regex pattern and compiles
* into a list of chained instructions.
*/
static reprog create_from(const char32_t* pattern, regex_flags const flags);
static reprog create_from(std::string const& pattern, regex_flags const flags);

int32_t add_inst(int32_t type);
int32_t add_inst(reinst inst);
Expand Down
93 changes: 48 additions & 45 deletions cpp/src/strings/regex/regex.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -25,7 +25,6 @@
#include <thrust/optional.h>
#include <thrust/pair.h>

#include <functional>
#include <memory>

namespace cudf {
Expand All @@ -35,17 +34,15 @@ class string_view;
namespace strings {
namespace detail {

struct reljunk;
struct reinst;
class reprog;
struct relist;

using match_pair = thrust::pair<cudf::size_type, cudf::size_type>;
using match_result = thrust::optional<match_pair>;

constexpr int32_t RX_STACK_SMALL = 112; ///< fastest stack size
constexpr int32_t RX_STACK_MEDIUM = 1104; ///< faster stack size
constexpr int32_t RX_STACK_LARGE = 10128; ///< fast stack size
constexpr int32_t RX_STACK_ANY = 8; ///< slowest: uses global memory
constexpr int32_t RX_STACK_SMALL = 112; ///< fastest stack size
constexpr int32_t RX_STACK_MEDIUM = 1104; ///< faster stack size
constexpr int32_t RX_STACK_LARGE = 2560; ///< fast stack size
robertmaynard marked this conversation as resolved.
Show resolved Hide resolved
constexpr int32_t RX_STACK_ANY = 8; ///< slowest: uses global memory

/**
* @brief Mapping the number of instructions to device code stack memory size.
Expand All @@ -71,7 +68,7 @@ class reclass_device {
int32_t count{};
char32_t* literals{};

__device__ bool is_match(char32_t ch, const uint8_t* flags);
__device__ bool is_match(char32_t ch, uint8_t const* flags) const;
};

/**
Expand Down Expand Up @@ -132,15 +129,7 @@ class reprog_device {
/**
* @brief Returns the number of regex instructions.
*/
[[nodiscard]] __host__ __device__ int32_t insts_counts() const { return _insts_count; }

/**
* @brief Returns true if this is an empty program.
*/
[[nodiscard]] __device__ bool is_empty() const
{
return insts_counts() == 0 || get_inst(0)->type == END;
}
[[nodiscard]] CUDF_HOST_DEVICE int32_t insts_counts() const { return _insts_count; }

/**
* @brief Returns the number of regex groups found in the expression.
Expand All @@ -151,19 +140,9 @@ class reprog_device {
}

/**
* @brief Returns the regex instruction object for a given index.
*/
[[nodiscard]] __device__ inline reinst* get_inst(int32_t idx) const;

/**
* @brief Returns the regex class object for a given index.
*/
[[nodiscard]] __device__ inline reclass_device get_class(int32_t idx) const;

/**
* @brief Returns the start-instruction-ids vector.
* @brief Returns true if this is an empty program.
*/
[[nodiscard]] __device__ inline int32_t* startinst_ids() const;
[[nodiscard]] __device__ inline bool is_empty() const;

/**
* @brief Does a find evaluation using the compiled expression on the given string.
Expand All @@ -180,9 +159,9 @@ class reprog_device {
*/
template <int stack_size>
__device__ inline int32_t find(int32_t idx,
string_view const& d_str,
int32_t& begin,
int32_t& end);
string_view const d_str,
cudf::size_type& begin,
cudf::size_type& end);

/**
* @brief Does an extract evaluation using the compiled expression on the given string.
Expand All @@ -204,34 +183,58 @@ class reprog_device {
*/
template <int stack_size>
__device__ inline match_result extract(cudf::size_type idx,
string_view const& d_str,
string_view const d_str,
cudf::size_type begin,
cudf::size_type end,
cudf::size_type group_id);

private:
int32_t _startinst_id, _num_capturing_groups;
int32_t _insts_count, _starts_count, _classes_count;
const uint8_t* _codepoint_flags{}; // table of character types
reinst* _insts{}; // array of regex instructions
int32_t* _startinst_ids{}; // array of start instruction ids
reclass_device* _classes{}; // array of regex classes
void* _relists_mem{}; // runtime relist memory for regexec
struct reljunk {
relist* list1;
relist* list2;
int32_t starttype{};
char32_t startchar{};
davidwendt marked this conversation as resolved.
Show resolved Hide resolved
__device__ inline reljunk(relist* list1, relist* list2, reinst const inst);
__device__ inline void swaplist();
};

/**
* @brief Returns the regex instruction object for a given id.
*/
__device__ inline reinst get_inst(int32_t id) const;

/**
* @brief Returns the regex class object for a given id.
*/
__device__ inline reclass_device get_class(int32_t id) const;

/**
* @brief Executes the regex pattern on the given string.
*/
__device__ inline int32_t regexec(
string_view const& d_str, reljunk& jnk, int32_t& begin, int32_t& end, int32_t group_id = 0);
string_view const d_str, reljunk jnk, int32_t& begin, int32_t& end, int32_t group_id = 0);

/**
* @brief Utility wrapper to set up state memory structures for calling regexec
*/
template <int stack_size>
__device__ inline int32_t call_regexec(
int32_t idx, string_view const& d_str, int32_t& begin, int32_t& end, int32_t group_id = 0);
int32_t idx, string_view const d_str, int32_t& begin, int32_t& end, int32_t group_id = 0);

reprog_device(reprog&);

int32_t _startinst_id;
int32_t _num_capturing_groups;
int32_t _insts_count;
int32_t _starts_count;
int32_t _classes_count;

uint8_t const* _codepoint_flags{}; // table of character types
reinst const* _insts{}; // array of regex instructions
int32_t const* _startinst_ids{}; // array of start instruction ids
reclass_device const* _classes{}; // array of regex classes

reprog_device(reprog&); // must use create()
void* _relists_mem{}; // runtime relist memory for regexec()
};

} // namespace detail
Expand Down
Loading