Skip to content

Commit

Permalink
Merge pull request #409 from bshifter/fx-rgx-subst-matchgrps
Browse files Browse the repository at this point in the history
Filterx regexp_subst match group changes
  • Loading branch information
alltilla authored Dec 13, 2024
2 parents 567b37a + 231c51a commit 8edca70
Show file tree
Hide file tree
Showing 3 changed files with 122 additions and 33 deletions.
106 changes: 79 additions & 27 deletions lib/filterx/expr-regexp-subst.c
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
#include "filterx/expr-regexp-common.h"
#include "compat/pcre.h"
#include "scratch-buffers.h"
#include <ctype.h>

DEFINE_FUNC_FLAG_NAMES(FilterXRegexpSubstFlags,
FILTERX_FUNC_REGEXP_SUBST_FLAG_JIT_NAME,
Expand All @@ -53,6 +54,7 @@ DEFINE_FUNC_FLAG_NAMES(FilterXRegexpSubstFlags,
FILTERX_FUNC_REGEXP_SUBST_FLAG_NEWLINE_NAME"=(boolean)" \
FILTERX_FUNC_REGEXP_SUBST_FLAG_GROUPS_NAME"=(boolean))" \

#define FILTERX_FUNC_REGEXP_SUBST_GRP_ID_MAX_DIGITS 3

typedef struct FilterXFuncRegexpSubst_
{
Expand All @@ -63,43 +65,84 @@ typedef struct FilterXFuncRegexpSubst_
FLAGSET flags;
} FilterXFuncRegexpSubst;

/*
 * Locate the next match group reference in a NUL-terminated string.
 *
 * A reference is a backslash immediately followed by one to
 * FILTERX_FUNC_REGEXP_SUBST_GRP_ID_MAX_DIGITS decimal digits (for example
 * "\1" or "\012"). Any further digits are not part of the reference.
 *
 * from: position to start scanning at; NULL or an empty string yields NULL.
 * to:   out parameter, must not be NULL. On success it receives the address
 *       of the first character after the reference; it is left untouched
 *       when no reference is found.
 *
 * Returns a pointer to the backslash that opens the reference, or NULL when
 * the rest of the string contains none.
 */
static gchar *
_next_matchgrp_ref(gchar *from, gchar **to)
{
  if (from == NULL || *from == '\0')
    return NULL;
  g_assert(to);

  for (; *from != '\0'; from++)
    {
      /* <ctype.h> classifiers take an unsigned char value (or EOF): passing a
       * plain char with the high bit set is undefined behaviour, hence the casts */
      if (*from != '\\' || !isdigit((unsigned char) from[1]))
        continue;

      gchar *start = from;

      /* step over the backslash and the first digit, which is already verified */
      from += 2;

      /* (from - start) is the backslash plus the digits consumed so far, so
       * this bound stops after MAX_DIGITS digits altogether */
      while (isdigit((unsigned char) *from) && from - start <= FILTERX_FUNC_REGEXP_SUBST_GRP_ID_MAX_DIGITS)
        from++;

      *to = from;
      return start;
    }

  return NULL;
}

/*
 * Convert the match group reference spanning [from, to) into its group index.
 *
 * from:  points at the backslash that opens the reference.
 * to:    points one past the last character of the reference.
 * value: out parameter receiving the decimal value of the digits.
 *
 * Returns TRUE only when the span is a backslash followed by at least one
 * character, all of them decimal digits. On FALSE the content of *value must
 * not be relied upon (it may hold a partially parsed number).
 */
static gboolean
_parse_machgrp_ref(const gchar *from, const gchar *to, gint *value)
{
  if (!from || !to || !value || from >= to)
    return FALSE;

  /* sanity limit on the span length; compared as a difference so that no
   * pointer beyond the end of the string is ever formed */
  if (to - from > 5)
    return FALSE;

  if (*from != '\\')
    return FALSE;

  const gchar *digit = from + 1;

  /* a backslash on its own is not a reference to group 0 */
  if (digit == to)
    return FALSE;

  *value = 0;

  /* <ctype.h> classifiers require an unsigned char value, hence the cast */
  for (; digit < to && isdigit((unsigned char) *digit); digit++)
    *value = (*value * 10) + (*digit - '0');

  /* stopping early means a non-digit character was inside the span */
  return digit == to;
}

/*
 * Expand the match group references of self->replacement for the current
 * match and store the outcome in replacement_string.
 *
 * self:               supplies the replacement template.
 * state:              supplies the PCRE2 match data, the subject string
 *                     (lhs_str) and rc, which is used here as the exclusive
 *                     upper bound of valid group indexes.
 * replacement_string: output buffer; truncated first, then filled.
 *
 * A reference is substituted only when it parses, its index is below
 * state->rc and the group is set in the ovector. Every other reference is
 * copied to the output verbatim.
 *
 * Always returns TRUE.
 */
static gboolean
_build_replacement_stirng_with_match_groups(const FilterXFuncRegexpSubst *self, FilterXReMatchState *state,
                                            GString *replacement_string)
{
  PCRE2_SIZE *ovector = pcre2_get_ovector_pointer(state->match_data);
  g_string_set_size(replacement_string, 0);
  gint num_grps = state->rc;

  gchar *pos = self->replacement;
  gchar *last = pos;   /* start of the template text that is not copied yet */
  gchar *close = NULL; /* first character after the reference found at pos */
  gint idx = -1;

  while ((pos = _next_matchgrp_ref(pos, &close)) != NULL)
    {
      if (_parse_machgrp_ref(pos, close, &idx) && (idx < num_grps))
        {
          PCRE2_SIZE start = ovector[2 * idx];
          PCRE2_SIZE end = ovector[2 * idx + 1];
          if (start != PCRE2_UNSET)
            {
              /* literal text between the previous reference and this one */
              g_string_append_len(replacement_string, last, pos - last);
              last = close;

              /* the text captured by the referenced group */
              size_t group_len = end - start;
              g_string_append_len(replacement_string, state->lhs_str + start, group_len);
            }
        }
      pos = close;
    }

  /* The loop only ends with pos == NULL, so "pos - last" cannot be used as
   * the length of the tail: append the NUL-terminated remainder instead. */
  if (last != NULL)
    g_string_append(replacement_string, last);

  return TRUE;
}

Expand All @@ -117,7 +160,6 @@ _replace_matches(const FilterXFuncRegexpSubst *self, FilterXReMatchState *state)
_build_replacement_stirng_with_match_groups(self, state, rep_str);
replacement_string = rep_str->str;
}

do
{
ovector = pcre2_get_ovector_pointer(state->match_data);
Expand Down Expand Up @@ -253,6 +295,13 @@ _extract_optional_flags(FilterXFuncRegexpSubst *self, FilterXFunctionArgs *args,
return TRUE;
}

/*
 * Report whether str holds at least one match group reference, as recognised
 * by _next_matchgrp_ref(). Where the reference ends is of no interest here.
 */
static gboolean
_contains_match_grp_ref(gchar *str)
{
  gchar *ref_end = NULL;
  gchar *first_ref = _next_matchgrp_ref(str, &ref_end);

  return first_ref != NULL;
}

static gboolean
_extract_subst_args(FilterXFuncRegexpSubst *self, FilterXFunctionArgs *args, GError **error)
{
Expand All @@ -277,7 +326,9 @@ _extract_subst_args(FilterXFuncRegexpSubst *self, FilterXFunctionArgs *args, GEr
self->replacement = _extract_subst_replacement_arg(args, error);
if (!self->replacement)
return FALSE;

// turn off group mode if there is no match group reference, because of its performance impact
if (!_contains_match_grp_ref(self->replacement))
set_flag(&self->flags, FILTERX_FUNC_REGEXP_SUBST_FLAG_GROUPS, FALSE);

return TRUE;
}
Expand Down Expand Up @@ -322,7 +373,8 @@ filterx_function_regexp_subst_new(FilterXFunctionArgs *args, GError **error)
self->super.super.deinit = _subst_deinit;
self->super.super.free_fn = _subst_free;

reset_flags(&self->flags, FLAG_VAL(FILTERX_FUNC_REGEXP_SUBST_FLAG_JIT));
reset_flags(&self->flags, FLAG_VAL(FILTERX_FUNC_REGEXP_SUBST_FLAG_JIT) | FLAG_VAL(
FILTERX_FUNC_REGEXP_SUBST_FLAG_GROUPS));
if (!_extract_subst_args(self, args, error) ||
!filterx_function_args_check(args, error))
goto error;
Expand Down
37 changes: 35 additions & 2 deletions lib/filterx/tests/test_expr_regexp_subst.c
Original file line number Diff line number Diff line change
Expand Up @@ -69,9 +69,9 @@ _build_subst_func(const gchar *pattern, const gchar *repr, const gchar *str, Fil
if (opts.utf8)
args = g_list_append(args, filterx_function_arg_new(FILTERX_FUNC_REGEXP_SUBST_FLAG_UTF8_NAME,
filterx_literal_new(filterx_boolean_new(TRUE))));
if (opts.groups)
if (!opts.groups)
args = g_list_append(args, filterx_function_arg_new(FILTERX_FUNC_REGEXP_SUBST_FLAG_GROUPS_NAME,
filterx_literal_new(filterx_boolean_new(TRUE))));
filterx_literal_new(filterx_boolean_new(FALSE))));

GError *err = NULL;
FilterXExpr *func = filterx_function_regexp_subst_new(filterx_function_args_new(args, NULL), &err);
Expand Down Expand Up @@ -350,6 +350,39 @@ Test(filterx_expr_regexp_subst, regexp_subst_group_subst_without_ref)
filterx_object_unref(result);
}

/*
 * Two-digit references (\10, \11, \12) have to select groups 10 to 12 instead
 * of being read as a one-digit reference followed by a literal digit.
 */
Test(filterx_expr_regexp_subst, regexp_subst_group_reference_with_multiple_digits)
{
  /* twelve capture groups of two digits each */
  const gchar *pattern =
    "(\\d{2})(\\d{2})(\\d{2})(\\d{2})"
    "(\\d{2})(\\d{2})(\\d{2})(\\d{2})"
    "(\\d{2})(\\d{2})(\\d{2})(\\d{2})";
  const gchar *replacement = "\\12-\\11-\\10-\\9\\8\\7\\6\\5\\4\\3\\2\\1";
  const gchar *subject = "010203040506070809101112";
  FilterXFuncRegexpSubstOpts opts = {.groups = TRUE};

  FilterXObject *result = _sub(pattern, replacement, subject, opts);
  cr_assert(filterx_object_is_type(result, &FILTERX_TYPE_NAME(string)));

  const gchar *actual = filterx_string_get_value_ref(result, NULL);
  cr_assert_str_eq(actual, "12-11-10-090807060504030201");

  filterx_object_unref(result);
}

/*
 * A reference to a group the pattern does not define (\20 with only three
 * groups) has to stay in the output verbatim, while the valid references
 * around it are still substituted.
 */
Test(filterx_expr_regexp_subst, regexp_subst_group_do_not_replace_unknown_ref)
{
  const gchar *pattern = "(\\d{2})(\\d{2})(\\d{2})";
  const gchar *replacement = "\\3\\20\\1";
  const gchar *subject = "010203";
  FilterXFuncRegexpSubstOpts opts = {.groups = TRUE};

  FilterXObject *result = _sub(pattern, replacement, subject, opts);
  cr_assert(filterx_object_is_type(result, &FILTERX_TYPE_NAME(string)));

  const gchar *actual = filterx_string_get_value_ref(result, NULL);
  cr_assert_str_eq(actual, "03\\2001");

  filterx_object_unref(result);
}

/*
 * Leading zeros are accepted in a reference (\02 is group 2), and a reference
 * takes at most three digits: "\0013.14" is group 1 followed by the literal
 * text "3.14".
 */
Test(filterx_expr_regexp_subst, regexp_subst_group_limited_digits_and_zero_prefixes)
{
  const gchar *pattern = "(\\w+),(\\w+),(\\w+)";
  const gchar *replacement = "\\3\\02\\0013.14";
  const gchar *subject = "baz,bar,foo";
  FilterXFuncRegexpSubstOpts opts = {.groups = TRUE};

  FilterXObject *result = _sub(pattern, replacement, subject, opts);
  cr_assert(filterx_object_is_type(result, &FILTERX_TYPE_NAME(string)));

  const gchar *actual = filterx_string_get_value_ref(result, NULL);
  cr_assert_str_eq(actual, "foobarbaz3.14");

  filterx_object_unref(result);
}

static void
setup(void)
{
Expand Down
12 changes: 8 additions & 4 deletions tests/light/functional_tests/filterx/test_filterx.py
Original file line number Diff line number Diff line change
Expand Up @@ -2004,9 +2004,11 @@ def test_regexp_subst(config, syslog_ng):
$MSG.orgrp_global = regexp_subst("foobarbaz", "(fo|az)", "!", global=true);
$MSG.ignore_case_control = regexp_subst("FoObArBaz", "(o|a)", "!", global=true);
$MSG.ignore_case = regexp_subst("FoObArBaz", "(o|a)", "!", ignorecase=true, global=true);
$MSG.groups_off = regexp_subst("25-02-2022", /(\d{2})-(\d{2})-(\d{4})/, "\\3-\\2-\\1");;
$MSG.groups_on = regexp_subst("25-02-2022", /(\d{2})-(\d{2})-(\d{4})/, "\\3-\\2-\\1", groups=true);
$MSG.mixed_grps = regexp_subst("25-02-2022", /(\d{2})-(\d{2})-(\d{4})/, "foo:\\3-\\2-\\1:bar:baz", groups=true);
$MSG.groups_off = regexp_subst("25-02-2022", /(\d{2})-(\d{2})-(\d{4})/, "\\3-\\2-\\1", groups=false);
$MSG.groups_on = regexp_subst("25-02-2022", /(\d{2})-(\d{2})-(\d{4})/, "\\3-\\2-\\1");
$MSG.mixed_grps = regexp_subst("25-02-2022", /(\d{2})-(\d{2})-(\d{4})/, "foo:\\3-\\2-\\1:bar:baz");
$MSG.multi_digit_grps = regexp_subst("010203040506070809101112", /(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})/, "\\10-\\11-\\12");
$MSG.prefixing_zeros = regexp_subst("foobar", /^(.*)$/, "\\001012345");
""",
)
syslog_ng.start(config)
Expand All @@ -2028,7 +2030,9 @@ def test_regexp_subst(config, syslog_ng):
r""""ignore_case":"F!!b!rB!z","""
r""""groups_off":"\\3-\\2-\\1","""
r""""groups_on":"2022-02-25","""
r""""mixed_grps":"foo:2022-02-25:bar:baz"}""" + "\n"
r""""mixed_grps":"foo:2022-02-25:bar:baz","""
r""""multi_digit_grps":"10-11-12","""
r""""prefixing_zeros":"foobar012345"}""" + "\n"
)
assert file_true.read_log() == exp

Expand Down

0 comments on commit 8edca70

Please sign in to comment.