Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add LIKE operator support #241

Merged
merged 12 commits into from
Oct 25, 2020
13 changes: 10 additions & 3 deletions sql/parser/expr.go
Original file line number Diff line number Diff line change
Expand Up @@ -126,10 +126,17 @@ func (p *Parser) parseOperator() (func(lhs, rhs expr.Expr) expr.Expr, scanner.To
p.Unscan()
return expr.Is, op, nil
case scanner.NOT:
if tok, pos, lit := p.ScanIgnoreWhitespace(); tok != scanner.IN {
return nil, 0, newParseError(scanner.Tokstr(tok, lit), []string{"IN"}, pos)
tok, pos, lit := p.ScanIgnoreWhitespace()
switch tok {
case scanner.IN:
return expr.NotIn, op, nil
case scanner.LIKE:
return expr.NotLike, op, nil
}
return expr.NotIn, op, nil

return nil, 0, newParseError(scanner.Tokstr(tok, lit), []string{"IN, LIKE"}, pos)
case scanner.LIKE:
return expr.Like, op, nil
}

panic(fmt.Sprintf("unknown operator %q", op))
Expand Down
14 changes: 2 additions & 12 deletions sql/query/expr/comparison.go
Original file line number Diff line number Diff line change
Expand Up @@ -426,7 +426,7 @@ func (op cmpOp) compare(l, r document.Value) (bool, error) {
func IsComparisonOperator(op Operator) bool {
switch op.(type) {
case eqOp, neqOp, gtOp, gteOp, ltOp, lteOp,
isOp, isNotOp, inOp, notInOp:
isOp, isNotOp, inOp, notInOp, likeOp, notLikeOp:
return true
}

Expand Down Expand Up @@ -540,17 +540,7 @@ func NotIn(a, b Expr) Expr {
}

func (op notInOp) Eval(ctx EvalStack) (document.Value, error) {
v, err := op.inOp.Eval(ctx)
if err != nil {
return v, err
}
if v == trueLitteral {
return falseLitteral, nil
}
if v == falseLitteral {
return trueLitteral, nil
}
return v, nil
return invertBoolResult(op.inOp.Eval)(ctx)
}

func (op notInOp) String() string {
Expand Down
17 changes: 17 additions & 0 deletions sql/query/expr/expr.go
Original file line number Diff line number Diff line change
Expand Up @@ -132,3 +132,20 @@ type Parentheses struct {
// Eval evaluates the wrapped expression E against es and returns its result
// unchanged.
func (p Parentheses) Eval(es EvalStack) (document.Value, error) {
	inner := p.E
	return inner.Eval(es)
}

// invertBoolResult wraps the evaluation function f so that a boolean result
// is negated: a result equal to trueLitteral becomes falseLitteral and vice
// versa. An error, or any value equal to neither literal, is passed through
// exactly as f returned it.
func invertBoolResult(f func(ctx EvalStack) (document.Value, error)) func(ctx EvalStack) (document.Value, error) {
	return func(ctx EvalStack) (document.Value, error) {
		res, err := f(ctx)
		switch {
		case err != nil:
			return res, err
		case res == trueLitteral:
			return falseLitteral, nil
		case res == falseLitteral:
			return trueLitteral, nil
		default:
			// Neither boolean literal: nothing to invert.
			return res, nil
		}
	}
}
60 changes: 60 additions & 0 deletions sql/query/expr/like.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
package expr

import (
"errors"
"fmt"
"github.com/genjidb/genji/document"
"github.com/genjidb/genji/sql/query/glob"
"github.com/genjidb/genji/sql/scanner"
)

// like reports whether text matches the LIKE pattern. It delegates the
// matching to glob.MatchLike.
func like(pattern, text string) bool {
	matched := glob.MatchLike(pattern, text)
	return matched
}

// likeOp is the operator type behind the LIKE expression. It embeds
// *simpleOperator, which carries the two operands and the operator token.
type likeOp struct {
*simpleOperator
}

// Like builds the expression "a LIKE b": a is the text operand and b is the
// pattern operand.
func Like(a, b Expr) Expr {
	base := &simpleOperator{a, b, scanner.LIKE}
	return &likeOp{simpleOperator: base}
}

// Eval evaluates both operands and matches the left one (the text) against
// the right one (the pattern). It returns trueLitteral or falseLitteral on
// success. If operand evaluation fails, or either operand is not a text
// value, it returns nullLitteral together with an error.
func (op likeOp) Eval(ctx EvalStack) (document.Value, error) {
	lhs, rhs, err := op.simpleOperator.eval(ctx)
	if err != nil {
		return nullLitteral, err
	}

	bothText := lhs.Type == document.TextValue && rhs.Type == document.TextValue
	if !bothText {
		return nullLitteral, errors.New("LIKE operator takes a text")
	}

	// The right-hand operand is the pattern, the left-hand one the input.
	pattern, text := rhs.V.(string), lhs.V.(string)
	if !like(pattern, text) {
		return falseLitteral, nil
	}
	return trueLitteral, nil
}

// String renders the operator as "<a> LIKE <b>", formatting both operands
// with %v.
func (op likeOp) String() string {
	lhs, rhs := op.a, op.b
	return fmt.Sprintf("%v LIKE %v", lhs, rhs)
}

// notLikeOp is the operator type behind the NOT LIKE expression. It embeds
// likeOp and overrides Eval and String.
type notLikeOp struct {
likeOp
}

// NotLike builds the expression "a NOT LIKE b": a is the text operand and b
// is the pattern operand.
func NotLike(a, b Expr) Expr {
	inner := likeOp{simpleOperator: &simpleOperator{a, b, scanner.LIKE}}
	return &notLikeOp{likeOp: inner}
}

// Eval evaluates the embedded LIKE operator and passes its result through
// invertBoolResult.
func (op notLikeOp) Eval(ctx EvalStack) (document.Value, error) {
	negated := invertBoolResult(op.likeOp.Eval)
	return negated(ctx)
}

// String renders the operator as "<a> NOT LIKE <b>", formatting both operands
// with %v.
func (op notLikeOp) String() string {
	lhs, rhs := op.a, op.b
	return fmt.Sprintf("%v NOT LIKE %v", lhs, rhs)
}
3 changes: 3 additions & 0 deletions sql/query/glob/doc.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
// Package glob implements wildcard pattern matching algorithms for strings.
//
package glob
194 changes: 194 additions & 0 deletions sql/query/glob/like.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,194 @@
// The author disclaims copyright to this source code. In place of
// a legal notice, here is a blessing:
//
// May you do good and not evil.
// May you find forgiveness for yourself and forgive others.
// May you share freely, never taking more than you give.
//
// This is an optimized Go port of SQLite’s icuLikeCompare routine using backtracking.
// See https://sqlite.org/src/file?name=ext%2Ficu%2Ficu.c&ln=117-195&ci=54b54f02c66c5aea
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for quoting the source 🙏🏼


package glob

import (
"unicode"
"unicode/utf8"
)

// Special characters recognized by MatchLike in a pattern.
const (
matchOne = '_' // wildcard matching any one character
matchAll = '%' // wildcard matching zero or more characters
matchEsc = '\\' // escape character: the next pattern character is taken literally
)

// readRune decodes the first Unicode code point of s and returns it together
// with the rest of s. A leading byte that is not valid UTF-8 is returned as a
// rune holding that byte's value, and exactly one byte is consumed. For an
// empty s it returns utf8.RuneError and the empty string.
func readRune(s string) (rune, string) {
	first, width := utf8.DecodeRuneInString(s)
	if width == 1 && first == utf8.RuneError {
		// Invalid encoding: surface the raw byte rather than U+FFFD.
		first = rune(s[0])
	}
	return first, s[width:]
}

// skipRune drops the first Unicode code point from s and returns what is
// left. An invalid leading byte counts as one code point of width one; an
// empty s is returned unchanged.
func skipRune(s string) string {
	_, width := utf8.DecodeRuneInString(s)
	rest := s[width:]
	return rest
}

// equalFold reports whether the runes sr and tr are equal under simple
// Unicode case folding. It is the single-rune counterpart of
// strings.EqualFold.
func equalFold(sr, tr rune) bool {
	if sr == tr {
		return true
	}

	// Order the pair so that lo < hi; folding is symmetric.
	lo, hi := sr, tr
	if hi < lo {
		lo, hi = hi, lo
	}

	if hi < utf8.RuneSelf {
		// Both runes are ASCII: they fold together only when lo is an
		// upper-case letter and hi is its lower-case form.
		if lo < 'A' || lo > 'Z' {
			return false
		}
		return hi == lo+'a'-'A'
	}

	// General case: walk the fold orbit of lo. SimpleFold yields the next
	// larger equivalent rune, or wraps around to the smallest one. Stop once
	// the walk returns to lo or reaches/passes hi.
	for r := unicode.SimpleFold(lo); ; r = unicode.SimpleFold(r) {
		if r == lo || r >= hi {
			return r == hi
		}
	}
}

// MatchLike reports whether string s matches the SQL LIKE-style glob pattern.
// Supported wildcards are '_' (match any one character) and '%' (match zero
// or more characters). They can be escaped by '\' (escape character).
//
// Ordinary characters are compared rune by rune with equalFold, so matching
// is case-insensitive under simple Unicode case folding.
//
// MatchLike requires pattern to match whole string, not just a substring.
// Once s is exhausted, whatever is left of pattern may consist only of '%'
// characters, optionally followed by a single trailing '\'.
func MatchLike(pattern, s string) bool {
// prevEscape is true when the previously read pattern character was an
// unescaped escape character, i.e. the next one must be taken literally.
var prevEscape bool

// Backtracking state: w is the pattern remainder saved after the most
// recent run of matchAll characters, and t is the input as it was when w
// was saved. An empty w means there is nothing to backtrack to.
var w, t string

for len(s) != 0 {
// Read (and consume) the next character from the input pattern.
var p rune
if len(pattern) == 0 {
goto backtrack
}
p, pattern = readRune(pattern)

loop:
// There are now 4 possibilities:
//
// 1. p is an unescaped matchAll character “%”,
// 2. p is an unescaped matchOne character “_”,
// 3. p is an unescaped matchEsc character, or
// 4. p is to be handled as an ordinary character
//
if p == matchAll && !prevEscape {
// Case 1.
var c byte

// Skip any matchAll or matchOne characters that follow a
// matchAll. For each matchOne, skip one character in the
// test string.
//
// Both wildcards are single-byte ASCII characters, so looking at
// the leading byte of the pattern is enough here.
//
for len(pattern) != 0 {
c = pattern[0]
if c != matchAll && c != matchOne {
break
}
pattern = pattern[1:]

if c != matchOne {
continue
}
if len(s) == 0 {
return false
}
s = skipRune(s)
}

// The pattern ended with a matchAll run: it matches whatever is
// left of the input.
if len(pattern) == 0 {
return true
}

// Save state and match next character.
//
// Since we save t = s and then continue to loop for len(s) ≠ 0,
// the condition len(t) ≠ 0 is always true when we need to backtrack.
//
w, t = pattern, s
} else if p == matchOne && !prevEscape {
// Case 2.
//
// We can either enter loop on normal iteration where len(s) ≠ 0,
// or from backtracking. But we consume all matchOne characters
// before saving backtracking state, so this case is reachable on
// normal iteration only.
//
// That is, we are guaranteed to have input at this point.
//
s = skipRune(s)
} else if p == matchEsc && !prevEscape {
// Case 3.
//
// We can’t reach this case from backtracking to matchAll.
// That implies len(s) ≠ 0 and normal iteration on continue.
// We would either have an escaped character in the pattern,
// or we’ve consumed whole pattern and attempt to backtrack.
// If we can’t backtrack then we are not at the end of input
// since len(s) ≠ 0, and false is returned. That said, it’s
// impossible to exit the loop with truthy prevEscape.
//
prevEscape = true
} else {
// Case 4.
//
// p is an ordinary (or escaped) character: it must equal the next
// input character under case folding.
prevEscape = false

var r rune
r, s = readRune(s)
if !equalFold(p, r) {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouldn't we use direct comparison?

I ran this query on different DBs.

SELECT 'abc' LIKE 'ABC';

Results:

  • PostgreSQL 12.3 returns false.
  • SQLite 3.27.2 returns 1 (true).
  • MySQL 5.7.12 returns 1 (true).
  • Oracle Database 11g returns false (Query: SELECT * FROM DUAL WHERE 'abc' like 'ABC', should be non-empty if true).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should stick with what MySQL and SQLite do.

Copy link
Contributor

@tie tie Oct 12, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In fact, ideally we should be comparing grapheme clusters using e.g. github.com/clipperhouse/uax29/graphemes and golang.org/x/text/collate.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How about we work on collation support in a separate PR? This one is already pretty big and since Genji is not stable yet we can give ourselves time to improve before locking things up.
Unless you think adding it would not take long?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

SQLite performs (simple) case folding for character comparison with LIKE operator, and since that’s what our implementation is based on, I think it’s reasonable to follow this behavior too.

A proper Unicode support would definitely take some time to implement given the current state of Unicode support in Go (scattered across third-party libraries with different Unicode versions, and each embeds their own character database copy).

goto backtrack
}
}
continue

backtrack:
// If we can’t backtrack return prevEscape
// to allow escaping end of input.
//
if len(w) == 0 {
return prevEscape && len(s) == 0
}

// Keep the pattern and skip rune in input.
// Note that we only backtrack to matchAll.
//
// Re-entering the loop with p = matchAll and pattern = w re-saves the
// backtracking state with the input advanced by one rune.
//
p, pattern = matchAll, w
prevEscape = false
s = skipRune(t)

goto loop
}

// The input is exhausted at this point.
// Check that the rest of the pattern is matchAll.
for i := 0; i < len(pattern); i++ {
if pattern[i] == matchAll {
continue
}

// Allow escaping end of string.
if i+1 == len(pattern) {
if pattern[i] == matchEsc {
return true
}
}

return false
}
return true
}
Loading