diff --git a/sql/parser/expr.go b/sql/parser/expr.go index 0b6762350..d86c7dd41 100644 --- a/sql/parser/expr.go +++ b/sql/parser/expr.go @@ -126,10 +126,17 @@ func (p *Parser) parseOperator() (func(lhs, rhs expr.Expr) expr.Expr, scanner.To p.Unscan() return expr.Is, op, nil case scanner.NOT: - if tok, pos, lit := p.ScanIgnoreWhitespace(); tok != scanner.IN { - return nil, 0, newParseError(scanner.Tokstr(tok, lit), []string{"IN"}, pos) + tok, pos, lit := p.ScanIgnoreWhitespace() + switch tok { + case scanner.IN: + return expr.NotIn, op, nil + case scanner.LIKE: + return expr.NotLike, op, nil } - return expr.NotIn, op, nil + + return nil, 0, newParseError(scanner.Tokstr(tok, lit), []string{"IN, LIKE"}, pos) + case scanner.LIKE: + return expr.Like, op, nil } panic(fmt.Sprintf("unknown operator %q", op)) diff --git a/sql/query/expr/comparison.go b/sql/query/expr/comparison.go index 5415d6056..92423ac86 100644 --- a/sql/query/expr/comparison.go +++ b/sql/query/expr/comparison.go @@ -426,7 +426,7 @@ func (op cmpOp) compare(l, r document.Value) (bool, error) { func IsComparisonOperator(op Operator) bool { switch op.(type) { case eqOp, neqOp, gtOp, gteOp, ltOp, lteOp, - isOp, isNotOp, inOp, notInOp: + isOp, isNotOp, inOp, notInOp, likeOp, notLikeOp: return true } @@ -540,17 +540,7 @@ func NotIn(a, b Expr) Expr { } func (op notInOp) Eval(ctx EvalStack) (document.Value, error) { - v, err := op.inOp.Eval(ctx) - if err != nil { - return v, err - } - if v == trueLitteral { - return falseLitteral, nil - } - if v == falseLitteral { - return trueLitteral, nil - } - return v, nil + return invertBoolResult(op.inOp.Eval)(ctx) } func (op notInOp) String() string { diff --git a/sql/query/expr/expr.go b/sql/query/expr/expr.go index d22de3931..fd9491995 100644 --- a/sql/query/expr/expr.go +++ b/sql/query/expr/expr.go @@ -132,3 +132,20 @@ type Parentheses struct { func (p Parentheses) Eval(es EvalStack) (document.Value, error) { return p.E.Eval(es) } + +func invertBoolResult(f func(ctx 
EvalStack) (document.Value, error)) func(ctx EvalStack) (document.Value, error) { + return func(ctx EvalStack) (document.Value, error) { + v, err := f(ctx) + + if err != nil { + return v, err + } + if v == trueLitteral { + return falseLitteral, nil + } + if v == falseLitteral { + return trueLitteral, nil + } + return v, nil + } +} diff --git a/sql/query/expr/like.go b/sql/query/expr/like.go new file mode 100644 index 000000000..6bbc2c77e --- /dev/null +++ b/sql/query/expr/like.go @@ -0,0 +1,60 @@ +package expr + +import ( + "errors" + "fmt" + "github.com/genjidb/genji/document" + "github.com/genjidb/genji/sql/query/glob" + "github.com/genjidb/genji/sql/scanner" +) + +func like(pattern, text string) bool { + return glob.MatchLike(pattern, text) +} + +type likeOp struct { + *simpleOperator +} + +// Like creates an expression that evaluates to the result of a LIKE b. +func Like(a, b Expr) Expr { + return &likeOp{&simpleOperator{a, b, scanner.LIKE}} +} + +func (op likeOp) Eval(ctx EvalStack) (document.Value, error) { + a, b, err := op.simpleOperator.eval(ctx) + if err != nil { + return nullLitteral, err + } + + if a.Type != document.TextValue || b.Type != document.TextValue { + return nullLitteral, errors.New("LIKE operator takes a text") + } + + if like(b.V.(string), a.V.(string)) { + return trueLitteral, nil + } + + return falseLitteral, nil +} + +func (op likeOp) String() string { + return fmt.Sprintf("%v LIKE %v", op.a, op.b) +} + +type notLikeOp struct { + likeOp +} + +// NotLike creates an expression that evaluates to the result of a NOT LIKE b. 
+func NotLike(a, b Expr) Expr { + return ¬LikeOp{likeOp{&simpleOperator{a, b, scanner.LIKE}}} +} + +func (op notLikeOp) Eval(ctx EvalStack) (document.Value, error) { + return invertBoolResult(op.likeOp.Eval)(ctx) +} + +func (op notLikeOp) String() string { + return fmt.Sprintf("%v NOT LIKE %v", op.a, op.b) +} diff --git a/sql/query/glob/doc.go b/sql/query/glob/doc.go new file mode 100644 index 000000000..12dfb74be --- /dev/null +++ b/sql/query/glob/doc.go @@ -0,0 +1,3 @@ +// Package glob implements wildcard pattern matching algorithms for strings. +// +package glob diff --git a/sql/query/glob/like.go b/sql/query/glob/like.go new file mode 100644 index 000000000..9454f0c68 --- /dev/null +++ b/sql/query/glob/like.go @@ -0,0 +1,194 @@ +// The author disclaims copyright to this source code. In place of +// a legal notice, here is a blessing: +// +// May you do good and not evil. +// May you find forgiveness for yourself and forgive others. +// May you share freely, never taking more than you give. +// +// This is an optimized Go port of the SQLite’s icuLikeCompare routine using backtracking. +// See https://sqlite.org/src/file?name=ext%2Ficu%2Ficu.c&ln=117-195&ci=54b54f02c66c5aea + +package glob + +import ( + "unicode" + "unicode/utf8" +) + +const ( + matchOne = '_' + matchAll = '%' + matchEsc = '\\' +) + +// readRune is like skipRune, but also returns the removed Unicode code point. +func readRune(s string) (rune, string) { + r, size := utf8.DecodeRuneInString(s) + if r == utf8.RuneError && size == 1 { + return rune(s[0]), s[1:] + } + return r, s[size:] +} + +// skipRune returns a slice of the string s with the first Unicode code point removed. +func skipRune(s string) string { + _, size := utf8.DecodeRuneInString(s) + return s[size:] +} + +// equalFold is strings.EqualFold for individual runes. +func equalFold(sr, tr rune) bool { + // Easy case. + if tr == sr { + return true + } + + // Make sr < tr to simplify what follows. 
const (
	// matchOne is the wildcard matching exactly one character.
	matchOne = '_'
	// matchAll is the wildcard matching zero or more characters.
	matchAll = '%'
	// matchEsc makes the following pattern character literal.
	matchEsc = '\\'

	// invalidByteBase is added to a byte that is not valid UTF-8 to build
	// the rune that stands for it. Invalid bytes are always >= 0x80, so the
	// results land in U+DC80..U+DCFF. These are surrogate code points, which
	// utf8.DecodeRuneInString never yields for well-formed input, so an
	// invalid byte can only ever compare equal to the very same invalid byte.
	invalidByteBase = 0xDC00
)

// readRune removes the first Unicode code point from s and returns it together
// with the remainder of s.
//
// A byte that is not valid UTF-8 is consumed on its own and reported as
// invalidByteBase plus the byte value. Reporting it as the bare byte value
// would make it indistinguishable from the valid code points U+0080..U+00FF
// (the invalid byte 0xFF would match the letter U+00FF, for example).
//
// For an empty s it returns utf8.RuneError and s unchanged.
func readRune(s string) (rune, string) {
	r, size := utf8.DecodeRuneInString(s)
	if r == utf8.RuneError && size == 1 {
		return invalidByteBase + rune(s[0]), s[1:]
	}
	return r, s[size:]
}

// skipRune returns a slice of the string s with the first Unicode code point
// removed. An invalid UTF-8 byte counts as one code point; an empty s is
// returned as is.
func skipRune(s string) string {
	_, size := utf8.DecodeRuneInString(s)
	return s[size:]
}

// equalFold is strings.EqualFold for individual runes: it reports whether sr
// and tr are equal under simple Unicode case folding.
func equalFold(sr, tr rune) bool {
	// Easy case.
	if tr == sr {
		return true
	}

	// Make sr < tr to simplify what follows.
	if tr < sr {
		tr, sr = sr, tr
	}
	// Fast check for ASCII.
	if tr < utf8.RuneSelf {
		// ASCII only, sr/tr must be upper/lower case.
		return 'A' <= sr && sr <= 'Z' && tr == sr+'a'-'A'
	}

	// General case. SimpleFold(x) returns the next equivalent rune > x
	// or wraps around to smaller values.
	r := unicode.SimpleFold(sr)
	for r != sr && r < tr {
		r = unicode.SimpleFold(r)
	}
	return r == tr
}

// MatchLike reports whether string s matches the SQL LIKE-style glob pattern.
// Supported wildcards are '_' (match any one character) and '%' (match zero
// or more characters). They can be escaped by '\' (escape character).
// Ordinary characters are compared with equalFold, i.e. case-insensitively.
//
// MatchLike requires pattern to match whole string, not just a substring.
//
// The matcher keeps a single backtracking point: the pattern position right
// after the most recent '%' (w) and the input position it was tried at (t).
// On a mismatch it retries the same pattern suffix one character further
// into the input.
func MatchLike(pattern, s string) bool {
	var prevEscape bool

	var w, t string // backtracking state

	for len(s) != 0 {
		// Read (and consume) the next character from the input pattern.
		var p rune
		if len(pattern) == 0 {
			goto backtrack
		}
		p, pattern = readRune(pattern)

	loop:
		// There are now 4 possibilities:
		//
		//     1. p is an unescaped matchAll character "%",
		//     2. p is an unescaped matchOne character "_",
		//     3. p is an unescaped matchEsc character, or
		//     4. p is to be handled as an ordinary character
		//
		if p == matchAll && !prevEscape {
			// Case 1.
			var c byte

			// Skip any matchAll or matchOne characters that follow a
			// matchAll. For each matchOne, skip one character in the
			// test string. Both wildcards are ASCII, so the pattern can
			// be inspected byte by byte here.
			//
			for len(pattern) != 0 {
				c = pattern[0]
				if c != matchAll && c != matchOne {
					break
				}
				pattern = pattern[1:]

				if c != matchOne {
					continue
				}
				if len(s) == 0 {
					return false
				}
				s = skipRune(s)
			}

			// A trailing run of wildcards matches whatever input is left.
			if len(pattern) == 0 {
				return true
			}

			// Save state and match next character.
			//
			// Since we save t = s and then continue to loop for len(s) != 0,
			// the condition len(t) != 0 is always true when we need to backtrack.
			//
			w, t = pattern, s
		} else if p == matchOne && !prevEscape {
			// Case 2.
			//
			// We can either enter loop on normal iteration where len(s) != 0,
			// or from backtracking. But we consume all matchOne characters
			// before saving backtracking state, so this case is reachable on
			// normal iteration only.
			//
			// That is, we are guaranteed to have input at this point.
			//
			s = skipRune(s)
		} else if p == matchEsc && !prevEscape {
			// Case 3.
			//
			// We can't reach this case from backtracking to matchAll.
			// That implies len(s) != 0 and normal iteration on continue.
			// We would either have an escaped character in the pattern,
			// or we've consumed whole pattern and attempt to backtrack.
			// If we can't backtrack then we are not at the end of input
			// since len(s) != 0, and false is returned. That said, it's
			// impossible to exit the loop with truthy prevEscape.
			//
			prevEscape = true
		} else {
			// Case 4.
			prevEscape = false

			var r rune
			r, s = readRune(s)
			if !equalFold(p, r) {
				goto backtrack
			}
		}
		continue

	backtrack:
		// If we can't backtrack return prevEscape
		// to allow escaping end of input.
		//
		if len(w) == 0 {
			return prevEscape && len(s) == 0
		}

		// Keep the pattern and skip rune in input.
		// Note that we only backtrack to matchAll.
		//
		p, pattern = matchAll, w
		prevEscape = false
		s = skipRune(t)

		goto loop
	}

	// The input is exhausted. Check that the rest of the pattern is matchAll.
	for i := 0; i < len(pattern); i++ {
		if pattern[i] == matchAll {
			continue
		}

		// Allow escaping end of string.
		if i+1 == len(pattern) {
			if pattern[i] == matchEsc {
				return true
			}
		}

		return false
	}
	return true
}
true}, + {"defabc", "%def", false}, + + // Contains + {"defabcdef", "%abc%", true}, + {"abcd", "%def%", false}, + {"abc", "b", false}, + + // Complex + {"abc", "ab%d", false}, + {"ABCD", "%B%C%", true}, + {"ABxCxxD", "a%b%c%d", true}, + {"a", "__", false}, + {"ab", "__", true}, + {"abc", "___", true}, + {"abcd", "____", true}, + {"abc", "____", false}, + {"abcd", "_b__", true}, + {"abcd", "_a__", false}, + {"abcd", "__c_", true}, + {"abcd", "__d_", false}, + + // Mixed + {"", "%_", false}, + {"", "_%", false}, + {"a", "%_", true}, + {"a", "%__", false}, + {"ab", "%_", true}, + {"abc", "%_", true}, + {"ab", "_%_", true}, + {"ab", "%_%_%", true}, + {"aab", "%b_", false}, + {"aaaa", "_aa%", true}, + {"aaaa", "%aa_", true}, + {"abc", "_%%_%_", true}, + {"abc", "_%%_%&_", false}, + {"abcd", "_b%__", true}, + {"abcd", "_a%__", false}, + {"abcd", "_%%_c_", true}, + {"abcd", "_%%_d_", false}, + {"abcde", "_b_d%_", true}, + {"abcde", "_%b%_%d%_", true}, + {"abcd", "_%b%c%_", true}, + {"ABxCxxD", "%__B", false}, + {"abBbc", "%b_c", true}, + + // Longer strings + { + "%abc%", + "%%\\%a%b%c\\%%%", + true, + }, + { + "aaabbaabbaab", + "%aabbaa%a%", + true, + }, + { + "abacaaadabacababacaaabadagabacaba", + "%a%a%a%a%a%a%a%a%a%a%a%a%a%a%a%a%a%", + true, + }, + { + "aaaaaaaaaaaaaaaa", + "%a%a%a%a%a%a%a%a%a%a%a%a%a%a%a%a%a%", + false, + }, + { + "%a%b%c%", + "%%%%%%%%a%%%%\\%%%%b%%%%\\%%%%c%%%%%%%%", + true, + }, + { + "a%a%a%a%a%a%a%a%a%a%a%a%a%a%a%a%a%", + "a%a\\%a%a\\%a%a\\%a%a\\%a%a\\%a%a\\%a%a\\%a%a\\%a%", + true, + }, + { + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaab", + "a%a%a%a%a%a%aa%aaa%a%a%b", + true, + }, + { + "abababababababababababababababababababaacacacacacacacadaeafagahaiajakalaaaaaaaaaaaaaaaaaffafagaagggagaaaaaaaab", + "%a%b%ba%ca%a%aa%aaa%fa%ga%b%", + true, + }, + { + "abababababababababababababababababababaacacacacacacacadaeafagahaiajakalaaaaaaaaaaaaaaaaaffafagaagggagaaaaaaaab", + "%a%b%ba%ca%a%x%aaa%fa%ga%b%", + 
false, + }, + { + "abababababababababababababababababababaacacacacacacacadaeafagahaiajakalaaaaaaaaaaaaaaaaaffafagaagggagaaaaaaaab", + "%a%b%ba%ca%aaaa%fa%ga%gggg%b%", + false, + }, + { + "abababababababababababababababababababaacacacacacacacadaeafagahaiajakalaaaaaaaaaaaaaaaaaffafagaagggagaaaaaaaab", + "%a%b%ba%ca%aaaa%fa%ga%ggg%b%", + true, + }, + } + + for _, test := range tests { + if got := MatchLike(test.pattern, test.s); got != test.want { + t.Errorf( + "MatchLike(%#v, %#v): expected %#v, got %#v", + test.pattern, test.s, test.want, got, + ) + } + } +} diff --git a/sql/scanner/scanner_test.go b/sql/scanner/scanner_test.go index c194ac8d6..3e70dcf8c 100644 --- a/sql/scanner/scanner_test.go +++ b/sql/scanner/scanner_test.go @@ -54,6 +54,7 @@ func TestScanner_Scan(t *testing.T) { {s: `>=`, tok: scanner.GTE, raw: `>=`}, {s: `IN`, tok: scanner.IN, raw: `IN`}, {s: `IS`, tok: scanner.IS, raw: `IS`}, + {s: `LIKE`, tok: scanner.LIKE, raw: `LIKE`}, // Misc tokens {s: `(`, tok: scanner.LPAREN, raw: `(`}, diff --git a/sql/scanner/token.go b/sql/scanner/token.go index fab98a8e7..a244e099e 100644 --- a/sql/scanner/token.go +++ b/sql/scanner/token.go @@ -56,6 +56,7 @@ const ( GTE // >= IN // IN IS // IS + LIKE // LIKE operatorEnd LPAREN // ( @@ -165,6 +166,7 @@ var tokens = [...]string{ GTE: ">=", IN: "IN", IS: "IS", + LIKE: "LIKE", LPAREN: "(", RPAREN: ")", @@ -238,7 +240,7 @@ func initKeywords() { for tok := keywordBeg + 1; tok < keywordEnd; tok++ { keywords[strings.ToLower(tokens[tok])] = tok } - for _, tok := range []Token{AND, OR, TRUE, FALSE, NULL, IN, IS} { + for _, tok := range []Token{AND, OR, TRUE, FALSE, NULL, IN, IS, LIKE} { keywords[strings.ToLower(tokens[tok])] = tok } } @@ -260,7 +262,7 @@ func (tok Token) Precedence() int { return 2 case IN: return 3 - case EQ, NEQ, EQREGEX, NEQREGEX, LT, LTE, GT, GTE, IS: + case EQ, NEQ, EQREGEX, NEQREGEX, LT, LTE, GT, GTE, IS, LIKE: return 4 case ADD, SUB, BITWISEOR, BITWISEXOR: return 5