Skip to content

Commit

Permalink
Regex flags /msin
Browse files Browse the repository at this point in the history
  • Loading branch information
eliaskosunen committed Dec 3, 2023
1 parent df082bc commit 88f3f52
Show file tree
Hide file tree
Showing 5 changed files with 249 additions and 26 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/linux.yml
Original file line number Diff line number Diff line change
Expand Up @@ -477,7 +477,7 @@ jobs:
uses: lukka/get-cmake@latest

- name: Install re2
if: matrix.engine == "re2"
if: ${{ matrix.engine == "re2" }}
run: |
git clone https://github.com/google/re2 -b 2023-11-01 --depth=1
cd re2
Expand Down
83 changes: 76 additions & 7 deletions include/scn/detail/format_string_parser.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,48 @@ namespace scn {
pointer, // 'p'
};

/// Bitmask of the option letters that may follow a regex in a format
/// string. Every enumerator occupies its own bit, so values can be
/// combined with the bitwise operators declared for this type.
enum class regex_flags {
    none = 0,
    multiline = 1 << 0,   // /m
    singleline = 1 << 1,  // /s
    nocase = 1 << 2,      // /i
    nocapture = 1 << 3,   // /n
    // TODO?
    // A collate flag would probably have to go hand-in-hand with locale,
    // where it could even be the default/only option -> no flag at all?
    // Why else would you even use locale with a regex?
    // collate = 1 << 4,
};

/// Bitwise AND: yields the flags that are present in both `a` and `b`.
constexpr regex_flags operator&(regex_flags a, regex_flags b)
{
    const unsigned lhs_bits = static_cast<unsigned>(a);
    const unsigned rhs_bits = static_cast<unsigned>(b);
    return static_cast<regex_flags>(lhs_bits & rhs_bits);
}
/// Bitwise OR: yields the flags that are present in `a`, in `b`, or in both.
constexpr regex_flags operator|(regex_flags a, regex_flags b)
{
    const unsigned lhs_bits = static_cast<unsigned>(a);
    const unsigned rhs_bits = static_cast<unsigned>(b);
    return static_cast<regex_flags>(lhs_bits | rhs_bits);
}
/// Bitwise XOR: yields the flags that are present in exactly one of
/// `a` and `b`.
constexpr regex_flags operator^(regex_flags a, regex_flags b)
{
    const unsigned lhs_bits = static_cast<unsigned>(a);
    const unsigned rhs_bits = static_cast<unsigned>(b);
    return static_cast<regex_flags>(lhs_bits ^ rhs_bits);
}

/// In-place AND: stores `a & b` into `a` and returns a reference to `a`.
constexpr regex_flags& operator&=(regex_flags& a, regex_flags b)
{
    a = a & b;
    return a;
}
/// In-place OR: stores `a | b` into `a` and returns a reference to `a`.
constexpr regex_flags& operator|=(regex_flags& a, regex_flags b)
{
    a = a | b;
    return a;
}
/// In-place XOR: stores `a ^ b` into `a` and returns a reference to `a`.
constexpr regex_flags& operator^=(regex_flags& a, regex_flags b)
{
    a = a ^ b;
    return a;
}

template <typename CharT>
struct basic_format_specs {
int width{0};
Expand All @@ -64,7 +106,7 @@ namespace scn {
std::array<uint8_t, 128 / 8> charset_literals{0};
bool charset_has_nonascii{false}, charset_is_inverted{false};
std::basic_string_view<CharT> charset_string{};
std::basic_string_view<CharT> regex_flags{};
regex_flags regexp_flags{regex_flags::none};
unsigned arbitrary_base : 6;
unsigned align : 2;
bool localized : 1;
Expand Down Expand Up @@ -205,9 +247,9 @@ namespace scn {
{
m_specs.charset_string = pattern;
}
constexpr void on_regex_flags(std::basic_string_view<CharT> flags)
constexpr void on_regex_flags(regex_flags flags)
{
m_specs.regex_flags = flags;
m_specs.regexp_flags = flags;
}

constexpr void on_thsep()
Expand Down Expand Up @@ -640,21 +682,48 @@ namespace scn {
return begin;
}

regex_flags flags{regex_flags::none};
constexpr std::array<std::pair<char, regex_flags>, 4> flag_map{
{{'m', regex_flags::multiline},
{'s', regex_flags::singleline},
{'i', regex_flags::nocase},
{'n', regex_flags::nocapture}}};
for (; begin != end; ++begin) {
if (*begin == CharT{'}'}) {
break;
}
bool found_flag = false;
for (auto flag : flag_map) {
if (static_cast<CharT>(flag.first) != *begin) {
continue;
}
if ((flags & flag.second) != regex_flags::none) {
handler.on_error("Flag set multiple times in regex");
return begin;
}
#if SCN_REGEX_BACKEND == SCN_REGEX_BACKEND_STD
if (*begin == CharT{'s'}) {
handler.on_error(
"/s flag for regex isn't supported by regex "
"backend");
}
#endif
flags |= flag.second;
found_flag = true;
break;
}
if (!found_flag) {
handler.on_error("Invalid flag in regex");
return begin;
}
}
handler.on_regex_flags(flags);

if (SCN_UNLIKELY(begin == end)) {
handler.on_error("Unexpected end of regex in format string");
return begin;
}

auto flags_end = begin;
handler.on_regex_flags(
make_string_view_from_pointers(regex_end + 1, flags_end));

return begin;
#else
handler.on_error("Regular expression support is disabled");
Expand Down
137 changes: 124 additions & 13 deletions src/scn/impl/reader/regex_reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,16 +41,95 @@ namespace scn {
SCN_BEGIN_NAMESPACE

namespace impl {
#if SCN_REGEX_BACKEND == SCN_REGEX_BACKEND_STD
/// Converts the parsed regex flag set into std::regex syntax options.
///
/// @param flags  Flags collected from the format string.
/// @return The matching `syntax_option_type`, or an
///         `invalid_format_string` scan error when `/s` is requested,
///         since this function has no std::regex option to map it to.
constexpr auto make_regex_flags(detail::regex_flags flags)
    -> scan_expected<std::regex_constants::syntax_option_type>
{
    const bool want_multiline = (flags & detail::regex_flags::multiline) !=
                                detail::regex_flags::none;
    const bool want_singleline = (flags & detail::regex_flags::singleline) !=
                                 detail::regex_flags::none;
    const bool want_nocase =
        (flags & detail::regex_flags::nocase) != detail::regex_flags::none;
    const bool want_nocapture = (flags & detail::regex_flags::nocapture) !=
                                detail::regex_flags::none;

    // Reject the unsupported flag up front; nothing computed below is
    // observable on this path.
    if (want_singleline) {
        return unexpected_scan_error(
            scan_error::invalid_format_string,
            "/s flag for regex isn't supported by regex backend");
    }

    std::regex_constants::syntax_option_type options{};
    if (want_multiline) {
        options |= std::regex_constants::multiline;
    }
    if (want_nocase) {
        options |= std::regex_constants::icase;
    }
    if (want_nocapture) {
        options |= std::regex_constants::nosubs;
    }
    return options;
}
#elif SCN_REGEX_BACKEND == SCN_REGEX_BACKEND_BOOST
/// Converts the parsed regex flag set into Boost.Regex syntax options.
///
/// @param flags  Flags collected from the format string.
/// @return The combined `boost::regex_constants::syntax_option_type`.
constexpr auto make_regex_flags(detail::regex_flags flags)
    -> boost::regex_constants::syntax_option_type
{
    const bool want_multiline = (flags & detail::regex_flags::multiline) !=
                                detail::regex_flags::none;
    const bool want_singleline = (flags & detail::regex_flags::singleline) !=
                                 detail::regex_flags::none;
    const bool want_nocase =
        (flags & detail::regex_flags::nocase) != detail::regex_flags::none;
    const bool want_nocapture = (flags & detail::regex_flags::nocapture) !=
                                detail::regex_flags::none;

    boost::regex_constants::syntax_option_type options{};
    // Unlike the other flags, /m is handled by its absence: without it
    // the `no_mod_m` option is set.
    // NOTE(review): presumably Boost treats ^/$ as multi-line by default,
    // so this switches that off -- confirm against the Boost.Regex docs.
    if (!want_multiline) {
        options |= boost::regex_constants::no_mod_m;
    }
    if (want_singleline) {
        options |= boost::regex_constants::mod_s;
    }
    if (want_nocase) {
        options |= boost::regex_constants::icase;
    }
    if (want_nocapture) {
        options |= boost::regex_constants::nosubs;
    }
    return options;
}
#elif SCN_REGEX_BACKEND == SCN_REGEX_BACKEND_RE2
/// Converts the parsed regex flag set into RE2 configuration.
///
/// RE2 has no option for multi-line matching in its default (non-POSIX)
/// syntax, so that flag is expressed as a pattern prefix instead.
///
/// @param flags  Flags collected from the format string.
/// @return A pair of:
///         - the `RE2::Options` to construct the regex with, and
///         - a (possibly empty) string that the caller must prepend to
///           the pattern. It views a string literal, so it stays valid
///           after this function returns.
auto make_regex_flags(detail::regex_flags flags)
    -> std::pair<RE2::Options, std::string_view>
{
    RE2::Options opt{RE2::Quiet};
    std::string_view stringflags{};

    // RE2 is single-line by default (^ and $ only match at the ends of
    // the text); "(?m)" switches multi-line matching ON. The prefix must
    // therefore be added when /m IS requested -- the previous `== none`
    // test had it inverted, giving /m the opposite meaning from the
    // std::regex and Boost backends.
    if ((flags & detail::regex_flags::multiline) !=
        detail::regex_flags::none) {
        stringflags = "(?m)";
    }
    if ((flags & detail::regex_flags::singleline) !=
        detail::regex_flags::none) {
        opt.set_dot_nl(true);
    }
    if ((flags & detail::regex_flags::nocase) !=
        detail::regex_flags::none) {
        opt.set_case_sensitive(false);
    }
    if ((flags & detail::regex_flags::nocapture) !=
        detail::regex_flags::none) {
        opt.set_never_capture(true);
    }

    return {opt, stringflags};
}
#endif

template <typename CharT>
auto read_regex_string_impl(std::basic_string_view<CharT> pattern,
detail::regex_flags flags,
std::basic_string_view<CharT> input)
-> scan_expected<typename std::basic_string_view<CharT>::iterator>
{
#if SCN_REGEX_BACKEND == SCN_REGEX_BACKEND_STD
std::basic_regex<CharT> re{};
try {
re = std::basic_regex<CharT>{pattern.data(), pattern.size(),
std::basic_regex<CharT>::nosubs};
SCN_TRY(re_flags, make_regex_flags(flags));
re = std::basic_regex<CharT>{
pattern.data(), pattern.size(),
re_flags | std::regex_constants::nosubs};
}
catch (const std::regex_error& err) {
return unexpected_scan_error(scan_error::invalid_format_string,
Expand Down Expand Up @@ -81,12 +160,15 @@ namespace scn {
#if SCN_REGEX_BOOST_USE_ICU
boost::make_u32regex(pattern.data(),
pattern.data() + pattern.size(),
boost::regex_constants::no_except |
make_regex_flags(flags) |
boost::regex_constants::no_except |
boost::regex_constants::nosubs);
#else
boost::basic_regex<CharT>{pattern.data(), pattern.size(),
boost::regex_constants::no_except |
boost::regex_constants::nosubs};
boost::basic_regex<CharT>{
pattern.data(), pattern.size(),
make_regex_flags(flags) |
boost::regex_constants::no_except |
boost::regex_constants::nosubs};
#endif
if (re.status() != 0) {
return unexpected_scan_error(scan_error::invalid_format_string,
Expand Down Expand Up @@ -121,7 +203,18 @@ namespace scn {
ranges::distance(input.data(), matches[0].second);
#elif SCN_REGEX_BACKEND == SCN_REGEX_BACKEND_RE2
static_assert(std::is_same_v<CharT, char>);
auto re = re2::RE2{pattern, RE2::Quiet};
std::string flagged_pattern{};
auto re = [&]() {
auto [opts, flagstr] = make_regex_flags(flags);
opts.set_never_capture(true);
if (flagstr.empty()) {
return re2::RE2{pattern, opts};
}
flagged_pattern.reserve(flagstr.size() + pattern.size());
flagged_pattern.append(flagstr);
flagged_pattern.append(pattern);
return re2::RE2{flagged_pattern, opts};
}();
if (!re.ok()) {
return unexpected_scan_error(
scan_error::invalid_format_string,
Expand All @@ -141,14 +234,17 @@ namespace scn {

template <typename CharT>
auto read_regex_matches_impl(std::basic_string_view<CharT> pattern,
detail::regex_flags flags,
std::basic_string_view<CharT> input,
basic_regex_matches<CharT>& value)
-> scan_expected<typename std::basic_string_view<CharT>::iterator>
{
#if SCN_REGEX_BACKEND == SCN_REGEX_BACKEND_STD
std::basic_regex<CharT> re{};
try {
re = std::basic_regex<CharT>{pattern.data(), pattern.size()};
SCN_TRY(re_flags, make_regex_flags(flags));
re = std::basic_regex<CharT>{pattern.data(), pattern.size(),
re_flags};
}
catch (const std::regex_error& err) {
return unexpected_scan_error(scan_error::invalid_format_string,
Expand Down Expand Up @@ -215,10 +311,13 @@ namespace scn {
#if SCN_REGEX_BOOST_USE_ICU
boost::make_u32regex(pattern.data(),
pattern.data() + pattern.size(),
boost::regex_constants::no_except);
make_regex_flags(flags) |
boost::regex_constants::no_except);
#else
boost::basic_regex<CharT>{pattern.data(), pattern.size(),
boost::regex_constants::no_except};
boost::basic_regex<CharT>{
pattern.data(), pattern.size(),
make_regex_flags(flags) |
boost::regex_constants::no_except};
#endif
if (re.status() != 0) {
return unexpected_scan_error(scan_error::invalid_format_string,
Expand Down Expand Up @@ -272,13 +371,24 @@ namespace scn {
ranges::distance(input.data(), matches[0].second);
#elif SCN_REGEX_BACKEND == SCN_REGEX_BACKEND_RE2
static_assert(std::is_same_v<CharT, char>);
auto re = re2::RE2{pattern, RE2::Quiet};
std::string flagged_pattern{};
auto re = [&]() {
auto [opts, flagstr] = make_regex_flags(flags);
if (flagstr.empty()) {
return re2::RE2{pattern, opts};
}
flagged_pattern.reserve(flagstr.size() + pattern.size());
flagged_pattern.append(flagstr);
flagged_pattern.append(pattern);
return re2::RE2{flagged_pattern, opts};
}();
if (!re.ok()) {
return unexpected_scan_error(
scan_error::invalid_format_string,
"Failed to parse regular expression");
}
size_t max_matches_n =
// TODO: Optimize into a single batch allocation
const auto max_matches_n =
static_cast<size_t>(re.NumberOfCapturingGroups());
std::vector<std::optional<std::string_view>> matches(max_matches_n);
std::vector<re2::RE2::Arg> match_args(max_matches_n);
Expand Down Expand Up @@ -368,6 +478,7 @@ namespace scn {
ranges::data(range),
ranges::data(range) + ranges::size(range));
SCN_TRY(it, read_regex_matches_impl(specs.charset_string,
specs.regexp_flags,
input, value));
return ranges::begin(range) +
ranges::distance(input.begin(), it);
Expand Down
14 changes: 9 additions & 5 deletions src/scn/impl/reader/string_reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -135,26 +135,29 @@ namespace scn {
scan_expected<simple_borrowed_iterator_t<Range>> read(
Range&& range,
std::basic_string_view<SourceCharT> pattern,
detail::regex_flags flags,
std::basic_string<ValueCharT>& value)
{
SCN_TRY(it, impl(range, pattern));
SCN_TRY(it, impl(range, pattern, flags));
return read_string_impl(range, it, value);
}

template <typename Range, typename ValueCharT>
scan_expected<simple_borrowed_iterator_t<Range>> read(
Range&& range,
std::basic_string_view<SourceCharT> pattern,
detail::regex_flags flags,
std::basic_string_view<ValueCharT>& value)
{
SCN_TRY(it, impl(range, pattern));
SCN_TRY(it, impl(range, pattern, flags));
return read_string_view_impl(range, it, value);
}

private:
template <typename Range>
auto impl(Range&& range,
std::basic_string_view<SourceCharT> pattern)
std::basic_string_view<SourceCharT> pattern,
detail::regex_flags flags)
-> scan_expected<simple_borrowed_iterator_t<Range>>
{
if constexpr (!ranges::contiguous_range<Range>) {
Expand All @@ -172,7 +175,7 @@ namespace scn {
auto input = detail::make_string_view_from_pointers(
ranges::data(range),
ranges::data(range) + ranges::size(range));
SCN_TRY(it, read_regex_string_impl(pattern, input));
SCN_TRY(it, read_regex_string_impl(pattern, flags, input));
return ranges::begin(range) +
ranges::distance(input.begin(), it);
}
Expand Down Expand Up @@ -552,7 +555,8 @@ namespace scn {
#if !SCN_DISABLE_REGEX
case reader_type::regex:
return regex_string_reader_impl<SourceCharT>{}.read(
SCN_FWD(range), specs.charset_string, value);
SCN_FWD(range), specs.charset_string,
specs.regexp_flags, value);
#endif

default:
Expand Down
Loading

0 comments on commit 88f3f52

Please sign in to comment.