Skip to content

Commit

Permalink
[query] Fix matching of disjunction including empty string (#4258)
Browse files Browse the repository at this point in the history
  • Loading branch information
linasm authored Nov 4, 2023
1 parent 9142ce5 commit d4b0fe7
Show file tree
Hide file tree
Showing 7 changed files with 386 additions and 85 deletions.
23 changes: 9 additions & 14 deletions src/m3ninx/index/regexp.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@
package index

import (
"context"
"fmt"
re "regexp"
"regexp/syntax"
Expand All @@ -37,7 +36,6 @@ var (
// dotStartCompiledRegex is a CompileRegex that matches any input.
// NB: It can be accessed through DotStartCompiledRegex().
dotStarCompiledRegex CompiledRegex
cacheContext = context.Background()
)

func init() {
Expand Down Expand Up @@ -146,18 +144,15 @@ func CompileRegex(r []byte) (CompiledRegex, error) {
// Issue (a): Vellum does not allow regexps which use characters '^', or '$'.
// To address this issue, we strip these characters from appropriate locations in the parsed syntax.Regexp
// for Vellum's RE.
vellumRe, err := ensureRegexpUnanchored(reAst)
vellumRe, err := EnsureRegexpUnanchored(reAst)
if err != nil {
return CompiledRegex{}, fmt.Errorf("unable to create FST re: %v", err)
}

// Issue (b): Vellum treats every regular expression as anchored, where as the map-backed segment does not.
// To address this issue, we ensure that every incoming regular expression is modified to be anchored
// when querying the map-backed segment, and isn't anchored when querying Vellum's RE.
simpleRe, err := ensureRegexpAnchored(vellumRe)
if err != nil {
return CompiledRegex{}, fmt.Errorf("unable to create map re: %v", err)
}
simpleRe := EnsureRegexpAnchored(vellumRe)

simpleRE, err := re.Compile(simpleRe.String())
if err != nil {
Expand Down Expand Up @@ -191,10 +186,10 @@ func parseRegexp(re string) (*syntax.Regexp, error) {
return syntax.Parse(re, syntax.Perl)
}

// ensureRegexpAnchored adds '^' and '$' characters to appropriate locations in the parsed syntax.Regexp,
// to ensure every input regular expression is converted to it's equivalent anchored regular expression.
// EnsureRegexpAnchored adds '^' and '$' characters to appropriate locations in the parsed syntax.Regexp,
// to ensure every input regular expression is converted to its equivalent anchored regular expression.
// NB: assumes input regexp AST is un-anchored.
func ensureRegexpAnchored(unanchoredRegexp *syntax.Regexp) (*syntax.Regexp, error) {
func EnsureRegexpAnchored(unanchoredRegexp *syntax.Regexp) *syntax.Regexp {
ast := &syntax.Regexp{
Op: syntax.OpConcat,
Flags: syntax.Perl,
Expand All @@ -210,13 +205,13 @@ func ensureRegexpAnchored(unanchoredRegexp *syntax.Regexp) (*syntax.Regexp, erro
},
},
}
return simplify(ast.Simplify()), nil
return simplify(ast.Simplify())
}

// ensureRegexpUnanchored strips '^' and '$' characters from appropriate locations in the parsed syntax.Regexp,
// to ensure every input regular expression is converted to it's equivalent un-anchored regular expression
// EnsureRegexpUnanchored strips '^' and '$' characters from appropriate locations in the parsed syntax.Regexp,
// to ensure every input regular expression is converted to its equivalent un-anchored regular expression
// assuming the entire input is matched.
func ensureRegexpUnanchored(parsed *syntax.Regexp) (*syntax.Regexp, error) {
func EnsureRegexpUnanchored(parsed *syntax.Regexp) (*syntax.Regexp, error) {
r, _, err := ensureRegexpUnanchoredHelper(parsed, true, true)
if err != nil {
return nil, err
Expand Down
5 changes: 2 additions & 3 deletions src/m3ninx/index/regexp_prop_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -77,10 +77,9 @@ func TestRegexpCompilationProperty(t *testing.T) {
func compileRegexp(x string, t *testing.T) *regexp.Regexp {
ast, err := parseRegexp(x)
require.NoError(t, err)
astp, err := ensureRegexpUnanchored(ast)
require.NoError(t, err)
ast2p, err := ensureRegexpAnchored(astp)
astp, err := EnsureRegexpUnanchored(ast)
require.NoError(t, err)
ast2p := EnsureRegexpAnchored(astp)
re, err := regexp.Compile(ast2p.String())
require.NoError(t, err)
return re
Expand Down
91 changes: 45 additions & 46 deletions src/m3ninx/index/regexp_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,162 +57,162 @@ func TestEnsureRegexpUnachoredee(t *testing.T) {

func TestEnsureRegexpUnachored(t *testing.T) {
testCases := []testCase{
testCase{
{
name: "naked ^",
input: "^",
expectedOutput: "emp{}",
},
testCase{
{
name: "naked $",
input: "$",
expectedOutput: "emp{}",
},
testCase{
{
name: "empty string ^$",
input: "^$",
expectedOutput: "cat{}",
},
testCase{
{
name: "invalid naked concat ^$",
input: "$^",
expectedOutput: "cat{eot{}bot{}}",
},
testCase{
{
name: "simple case of ^",
input: "^abc",
expectedOutput: "str{abc}",
},
testCase{
{
name: "simple case of $",
input: "abc$",
expectedOutput: "str{abc}",
},
testCase{
{
name: "simple case of both ^ & $",
input: "^abc$",
expectedOutput: "str{abc}",
},
testCase{
{
name: "weird case of internal ^",
input: "^a^bc$",
expectedOutput: "cat{lit{a}bot{}str{bc}}",
},
testCase{
{
name: "weird case of internal $",
input: "^a$bc$",
expectedOutput: "cat{lit{a}eot{}str{bc}}",
},
testCase{
{
name: "alternate of sub expressions with only legal ^ and $",
input: "(?:^abc$)|(?:^xyz$)",
expectedOutput: "alt{str{abc}str{xyz}}",
},
testCase{
{
name: "concat of sub expressions with only legal ^ and $",
input: "(^abc$)(?:^xyz$)",
expectedOutput: "cat{cap{cat{str{abc}eot{}}}bot{}str{xyz}}",
},
testCase{
{
name: "alternate of sub expressions with illegal ^ and $",
input: "(?:^a$bc$)|(?:^xyz$)",
expectedOutput: "alt{cat{lit{a}eot{}str{bc}}str{xyz}}",
},
testCase{
{
name: "concat of sub expressions with illegal ^ and $",
input: "(?:^a$bc$)(?:^xyz$)",
expectedOutput: "cat{lit{a}eot{}str{bc}eot{}bot{}str{xyz}}",
},
testCase{
{
name: "question mark case both boundaries success",
input: "(?:^abc$)?",
expectedOutput: "que{str{abc}}",
},
testCase{
{
name: "question mark case only ^",
input: "(?:^abc)?",
expectedOutput: "que{str{abc}}",
},
testCase{
{
name: "question mark case only $",
input: "(?:abc$)?",
expectedOutput: "que{str{abc}}",
},
testCase{
{
name: "question concat case $",
input: "abc$?",
expectedOutput: "str{abc}",
},
testCase{
{
name: "star mark case both boundaries success",
input: "(?:^abc$)*",
expectedOutput: "cat{que{str{abc}}star{cat{bot{}str{abc}eot{}}}}",
},
testCase{
{
name: "star mark case only ^",
input: "(?:^abc)*",
expectedOutput: "cat{que{str{abc}}star{cat{bot{}str{abc}}}}",
},
testCase{
{
name: "star mark case only $",
input: "(?:abc$)*",
expectedOutput: "cat{que{str{abc}}star{cat{str{abc}eot{}}}}",
},
testCase{
{
name: "star concat case $",
input: "abc$*",
expectedOutput: "cat{str{abc}star{eot{}}}",
},
testCase{
{
name: "star concat case ^",
input: "^*abc",
expectedOutput: "cat{star{bot{}}str{abc}}",
},
testCase{
{
name: "plus mark case both boundaries success",
input: "(?:^abc$)+",
expectedOutput: "cat{str{abc}star{cat{bot{}str{abc}eot{}}}}",
},
testCase{
{
name: "plus mark case with capturing group",
input: "(^abc$)+",
expectedOutput: "cat{cap{str{abc}}star{cap{cat{bot{}str{abc}eot{}}}}}",
},
testCase{
{
name: "plus mark case only ^",
input: "(?:^abc)+",
expectedOutput: "cat{str{abc}star{cat{bot{}str{abc}}}}",
},
testCase{
{
name: "plus mark case only $",
input: "(?:abc$)+",
expectedOutput: "cat{str{abc}star{cat{str{abc}eot{}}}}",
},
testCase{
{
name: "plus concat case $",
input: "abc$+",
expectedOutput: "cat{str{abc}star{eot{}}}",
},
testCase{
{
name: "plus concat case ^",
input: "^+abc",
expectedOutput: "cat{star{bot{}}str{abc}}",
},
testCase{
{
name: "repeat case both boundaries success",
input: "(?:^abc$){3,4}",
expectedOutput: "cat{str{abc}rep{2,3 cat{bot{}str{abc}eot{}}}}",
},
testCase{
{
name: "repeat case unbounded max",
input: "(?:^abc$){3,}",
expectedOutput: "cat{str{abc}rep{2,-1 cat{bot{}str{abc}eot{}}}}",
},
testCase{
{
name: "repeat case unbounded max with 1 min",
input: "(?:^abc$){1,2}",
expectedOutput: "cat{str{abc}rep{0,1 cat{bot{}str{abc}eot{}}}}",
},
testCase{
{
name: "repeat case unbounded max with 0 min",
input: "(?:^abc$){0,2}",
expectedOutput: "rep{0,2 cat{bot{}str{abc}eot{}}}",
Expand All @@ -222,7 +222,7 @@ func TestEnsureRegexpUnachored(t *testing.T) {
t.Run(tc.name, func(t *testing.T) {
re, err := parseRegexp(tc.input)
require.NoError(t, err)
parsed, err := ensureRegexpUnanchored(re)
parsed, err := EnsureRegexpUnanchored(re)
require.NoError(t, err)
assert.Equal(t, tc.expectedOutput, dumpRegexp(parsed))
})
Expand All @@ -231,57 +231,57 @@ func TestEnsureRegexpUnachored(t *testing.T) {

func TestEnsureRegexpAnchored(t *testing.T) {
testCases := []testCase{
testCase{
{
name: "naked ^",
input: "(?:)",
expectedOutput: "cat{bot{}eot{\\z}}",
},
testCase{
{
name: "invalid naked concat ^$",
input: "$^",
expectedOutput: "cat{bot{}eot{}bot{}eot{\\z}}",
},
testCase{
{
name: "simple case of literal",
input: "abc",
expectedOutput: "cat{bot{}str{abc}eot{\\z}}",
},
testCase{
{
name: "weird case of internal ^",
input: "a^bc",
expectedOutput: "cat{bot{}lit{a}bot{}str{bc}eot{\\z}}",
},
testCase{
{
name: "weird case of internal $",
input: "a$bc",
expectedOutput: "cat{bot{}lit{a}eot{}str{bc}eot{\\z}}",
},
testCase{
{
name: "alternate of sub expressions with only legal ^ and $",
input: "abc|xyz",
expectedOutput: "cat{bot{}alt{str{abc}str{xyz}}eot{\\z}}",
},
testCase{
{
name: "concat of sub expressions with only legal ^ and $",
input: "(?:abc)(?:xyz)",
expectedOutput: "cat{bot{}str{abcxyz}eot{\\z}}",
},
testCase{
{
name: "question mark case both boundaries success",
input: "(?:abc)?",
expectedOutput: "cat{bot{}que{str{abc}}eot{\\z}}",
},
testCase{
{
name: "star mark case both boundaries success",
input: "(?:abc)*",
expectedOutput: "cat{bot{}star{str{abc}}eot{\\z}}",
},
testCase{
{
name: "plus mark case both boundaries success",
input: "(?:abc)+",
expectedOutput: "cat{bot{}plus{str{abc}}eot{\\z}}",
},
testCase{
{
name: "repeat case both boundaries success",
input: "(?:abc){3,4}",
expectedOutput: "cat{bot{}str{abc}str{abc}str{abc}que{str{abc}}eot{\\z}}",
Expand All @@ -291,8 +291,7 @@ func TestEnsureRegexpAnchored(t *testing.T) {
t.Run(tc.name, func(t *testing.T) {
re, err := parseRegexp(tc.input)
require.NoError(t, err)
parsed, err := ensureRegexpAnchored(re)
require.NoError(t, err)
parsed := EnsureRegexpAnchored(re)
assert.Equal(t, tc.expectedOutput, dumpRegexp(parsed))
})
}
Expand Down
Loading

0 comments on commit d4b0fe7

Please sign in to comment.