Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add LIKE operator support #241

Merged
merged 12 commits into from
Oct 25, 2020
13 changes: 10 additions & 3 deletions sql/parser/expr.go
Original file line number Diff line number Diff line change
Expand Up @@ -126,10 +126,17 @@ func (p *Parser) parseOperator() (func(lhs, rhs expr.Expr) expr.Expr, scanner.To
p.Unscan()
return expr.Is, op, nil
case scanner.NOT:
if tok, pos, lit := p.ScanIgnoreWhitespace(); tok != scanner.IN {
return nil, 0, newParseError(scanner.Tokstr(tok, lit), []string{"IN"}, pos)
tok, pos, lit := p.ScanIgnoreWhitespace()
switch tok {
case scanner.IN:
return expr.NotIn, op, nil
case scanner.LIKE:
return expr.NotLike, op, nil
}
return expr.NotIn, op, nil

return nil, 0, newParseError(scanner.Tokstr(tok, lit), []string{"IN, LIKE"}, pos)
case scanner.LIKE:
return expr.Like, op, nil
}

panic(fmt.Sprintf("unknown operator %q", op))
Expand Down
14 changes: 2 additions & 12 deletions sql/query/expr/comparison.go
Original file line number Diff line number Diff line change
Expand Up @@ -426,7 +426,7 @@ func (op cmpOp) compare(l, r document.Value) (bool, error) {
func IsComparisonOperator(op Operator) bool {
switch op.(type) {
case eqOp, neqOp, gtOp, gteOp, ltOp, lteOp,
isOp, isNotOp, inOp, notInOp:
isOp, isNotOp, inOp, notInOp, likeOp, notLikeOp:
return true
}

Expand Down Expand Up @@ -540,17 +540,7 @@ func NotIn(a, b Expr) Expr {
}

func (op notInOp) Eval(ctx EvalStack) (document.Value, error) {
v, err := op.inOp.Eval(ctx)
if err != nil {
return v, err
}
if v == trueLitteral {
return falseLitteral, nil
}
if v == falseLitteral {
return trueLitteral, nil
}
return v, nil
return invertBoolResult(op.inOp.Eval)(ctx)
}

func (op notInOp) String() string {
Expand Down
17 changes: 17 additions & 0 deletions sql/query/expr/expr.go
Original file line number Diff line number Diff line change
Expand Up @@ -132,3 +132,20 @@ type Parentheses struct {
func (p Parentheses) Eval(es EvalStack) (document.Value, error) {
	// Delegate to the wrapped expression E and hand back its result unchanged.
	inner := p.E
	return inner.Eval(es)
}

// invertBoolResult wraps the evaluation function f and returns a function
// that swaps trueLitteral and falseLitteral in f's result. Errors are passed
// through together with the value f returned, and any value that is neither
// of the two boolean literals is returned unchanged.
func invertBoolResult(f func(ctx EvalStack) (document.Value, error)) func(ctx EvalStack) (document.Value, error) {
	return func(ctx EvalStack) (document.Value, error) {
		result, err := f(ctx)
		if err != nil {
			return result, err
		}

		switch result {
		case trueLitteral:
			return falseLitteral, nil
		case falseLitteral:
			return trueLitteral, nil
		}

		// Not a boolean literal: leave the value as it is.
		return result, nil
	}
}
60 changes: 60 additions & 0 deletions sql/query/expr/like.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
package expr

import (
"errors"
"fmt"
"github.com/genjidb/genji/document"
"github.com/genjidb/genji/sql/query/glob"
"github.com/genjidb/genji/sql/scanner"
)

// like reports whether text matches the LIKE pattern by delegating to
// glob.MatchLike.
func like(pattern, text string) bool {
	matched := glob.MatchLike(pattern, text)
	return matched
}

// likeOp is the expression node for the LIKE operator. It embeds
// *simpleOperator, from which it takes its operands a and b and the
// eval helper used by Eval.
type likeOp struct {
*simpleOperator
}

// Like builds the expression node for "a LIKE b": a is the text being tested
// and b is the pattern it is matched against.
func Like(a, b Expr) Expr {
	operands := &simpleOperator{a, b, scanner.LIKE}
	return &likeOp{simpleOperator: operands}
}

func (op likeOp) Eval(ctx EvalStack) (document.Value, error) {
a, b, err := op.simpleOperator.eval(ctx)
if err != nil {
return nullLitteral, err
}

if a.Type != document.TextValue || b.Type != document.TextValue {
return nullLitteral, errors.New("LIKE operator takes an text")
tdakkota marked this conversation as resolved.
Show resolved Hide resolved
}

if like(b.V.(string), a.V.(string)) {
return trueLitteral, nil
}

return falseLitteral, nil
}

// String renders the expression as "<a> LIKE <b>".
func (op likeOp) String() string {
	lhs, rhs := op.a, op.b
	return fmt.Sprintf("%v LIKE %v", lhs, rhs)
}

// notLikeOp is the expression node for NOT LIKE. It embeds likeOp and
// reuses its evaluation, inverting the boolean result.
type notLikeOp struct {
likeOp
}

// NotLike builds the expression node for "a NOT LIKE b": a is the text being
// tested and b is the pattern. The underlying operator is constructed with
// the scanner.LIKE token.
func NotLike(a, b Expr) Expr {
	positive := likeOp{&simpleOperator{a, b, scanner.LIKE}}
	return &notLikeOp{likeOp: positive}
}

// Eval evaluates the embedded LIKE expression and returns its result with
// the boolean literals swapped by invertBoolResult.
func (op notLikeOp) Eval(ctx EvalStack) (document.Value, error) {
	negated := invertBoolResult(op.likeOp.Eval)
	return negated(ctx)
}

// String renders the expression as "<a> NOT LIKE <b>".
func (op notLikeOp) String() string {
	lhs, rhs := op.a, op.b
	return fmt.Sprintf("%v NOT LIKE %v", lhs, rhs)
}
140 changes: 140 additions & 0 deletions sql/query/glob/like.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
// The author disclaims copyright to this source code. In place of
// a legal notice, here is a blessing:
//
// May you do good and not evil.
// May you find forgiveness for yourself and forgive others.
// May you share freely, never taking more than you give.
//
// https://sqlite.org/src/file?name=ext%2Ficu%2Ficu.c&ln=117-195&m=54b54f02c66c5aea

package glob

import (
"unicode"
"unicode/utf8"
)

// Special characters recognized in a LIKE pattern.
const (
// matchOne matches exactly one character of the input.
matchOne = '_'
// matchAll matches zero or more characters of the input.
matchAll = '%'
// matchEsc makes the following pattern character be treated as an
// ordinary character.
matchEsc = '\\'
)

// readRune is like skipRune, but also returns the removed Unicode code point.
//
// An invalid UTF-8 byte is consumed on its own and returned as a rune holding
// that byte's value. When s is empty there is nothing to read: readRune then
// returns -1 and the empty string. -1 is not a valid code point, so it can
// never compare equal to a rune read from a non-empty pattern; returning
// utf8.RuneError here would let the pattern "\uFFFD" match an empty string.
func readRune(s string) (rune, string) {
	if len(s) == 0 {
		return -1, s
	}

	r, size := utf8.DecodeRuneInString(s)
	if r == utf8.RuneError && size == 1 {
		// Invalid encoding: fall back to the raw byte.
		return rune(s[0]), s[1:]
	}
	return r, s[size:]
}

// skipRune drops the first UTF-8 encoded code point from s and returns what
// is left. An invalid leading byte counts as one code point of width one; an
// empty string is returned as is.
func skipRune(s string) string {
	_, width := utf8.DecodeRuneInString(s)
	rest := s[width:]
	return rest
}

// equalFold reports whether the runes sr and tr are equal under simple
// Unicode case folding; it is the single-rune counterpart of
// strings.EqualFold.
func equalFold(sr, tr rune) bool {
	if sr == tr {
		return true
	}

	// Order the pair so that lo < hi; the checks below rely on it.
	lo, hi := sr, tr
	if hi < lo {
		lo, hi = hi, lo
	}

	// Both runes are ASCII: they fold together only when lo is an upper-case
	// letter and hi is its lower-case form.
	if hi < utf8.RuneSelf {
		return 'A' <= lo && lo <= 'Z' && hi == lo+'a'-'A'
	}

	// Walk lo's folding orbit. SimpleFold yields the next larger equivalent
	// rune, or wraps around to a smaller one, so stop once we are back at lo
	// or have reached/passed hi.
	for r := unicode.SimpleFold(lo); ; r = unicode.SimpleFold(r) {
		if r == lo || r >= hi {
			return r == hi
		}
	}
}

// MatchLike reports whether string s matches the SQL LIKE-style glob pattern.
// Supported wildcards are '_' (match any one character) and '%' (match zero
// or more characters). They can be escaped by '\' (escape character).
// Ordinary characters are compared case-insensitively using simple Unicode
// case folding.
//
// MatchLike requires pattern to match the whole string, not just a substring.
func MatchLike(pattern, s string) bool {
var prevEscape bool

for len(pattern) != 0 {
// Read (and consume) the next character from the input pattern.
var p rune
p, pattern = readRune(pattern)

// There are now 4 possibilities:
//
// 1. p is an unescaped match-all character "%",
// 2. p is an unescaped match-one character "_",
// 3. p is an unescaped escape character, or
tdakkota marked this conversation as resolved.
Show resolved Hide resolved
// 4. p is to be handled as an ordinary character
//
if p == matchAll && !prevEscape {
// Case 1.
var c byte

// Skip any matchAll or matchOne characters that follow a
// matchAll. For each matchOne, skip one character in the
// test string.
//
for len(pattern) != 0 {
c = pattern[0]
if c != matchAll && c != matchOne {
break
}
pattern = pattern[1:]

if c != matchOne {
continue
}
if len(s) == 0 {
return false
}
s = skipRune(s)
}

if len(pattern) == 0 {
return true
}

for len(s) != 0 {
if MatchLike(pattern, s) {
return true
}
s = skipRune(s)
}
return false
} else if p == matchOne && !prevEscape {
// Case 2.
if len(s) == 0 {
return false
}
tdakkota marked this conversation as resolved.
Show resolved Hide resolved
s = skipRune(s)
} else if p == matchEsc && !prevEscape {
// Case 3.
prevEscape = true
} else {
// Case 4.
var r rune
r, s = readRune(s)
if !equalFold(p, r) {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouldn't we use direct comparison?

I ran this query on different DBs.

SELECT 'abc' LIKE 'ABC';

Results:

  • PostgreSQL 12.3 returns false.
  • SQLite 3.27.2 returns 1 (true).
  • MySQL 5.7.12 returns 1 (true).
  • Oracle Database 11g returns false (Query: SELECT * FROM DUAL WHERE 'abc' like 'ABC', should be non-empty if true).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should stick with what MySQL and SQLite do.

Copy link
Contributor

@tie tie Oct 12, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In fact, ideally we should be comparing grapheme clusters using e.g. github.com/clipperhouse/uax29/graphemes and golang.org/x/text/collate.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How about we work on collation support in a separate PR? This one is already pretty big and since Genji is not stable yet we can give ourselves time to improve before locking things up.
Unless you think adding it would not take long?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

SQLite performs (simple) case folding for character comparison with LIKE operator, and since that’s what our implementation is based on, I think it’s reasonable to follow this behavior too.

A proper Unicode support would definitely take some time to implement given the current state of Unicode support in Go (scattered across third-party libraries with different Unicode versions, and each embeds their own character database copy).

return false
}
prevEscape = false
}
}

return len(s) == 0
}
71 changes: 71 additions & 0 deletions sql/query/glob/like_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
package glob

import (
"testing"
)

// TestMatchLike checks MatchLike against a table of (input, pattern) pairs
// covering empty input, both wildcards, escaping, exact matches and
// prefix/suffix/substring patterns. Ordinary characters are expected to be
// compared case-insensitively.
func TestMatchLike(t *testing.T) {
	tests := []struct {
		s, pattern string
		want       bool
	}{
		// Empty
		{"", "", true},
		{"abc", "", false},

		// One
		{"x", "_", true},
		{"xx", "_", false},
		{"", "_", false},

		// Any
		{"abc", "%", true},
		{"", "%", true},

		// Escape
		{"%", "\\%", true},
		{"_", "\\_", true},
		{"x", "\\%", false},
		{"x", "\\_", false},
		{"x", "\\x", true},

		// Escaping escape
		{"\\", "\\\\", true},
		{"\\", "\\\\%", true},
		{"\\", "\\\\_", false},
		{"\\x", "\\\\x", true},

		// Exact
		{"abc", "abc", true},
		// Letters that differ only in case match: comparison is case-insensitive.
		{"aBc", "AbC", true},
		{"abc", "def", false},

		// Prefix
		{"abcdef", "abc%", true},
		{"abcdef", "def%", false},

		// Suffix
		{"defabc", "%abc", true},
		{"defabc", "%def", false},

		// Contains
		{"defabcdef", "%abc%", true},
		{"abcd", "%def%", false},
		{"abc", "b", false},

		// Complex
		{"ABCD", "%B%C%", true},
		{"ABCD", "_%B%C%_", true},
		{"ABxCxxD", "a%b%c%d", true},
		{"ABxCxxD", "%__B", false},
	}

	for _, test := range tests {
		if got := MatchLike(test.pattern, test.s); got != test.want {
			t.Errorf(
				"MatchLike(%#v, %#v): expected %#v, got %#v",
				test.pattern, test.s, test.want, got,
			)
		}
	}
}
1 change: 1 addition & 0 deletions sql/scanner/scanner_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ func TestScanner_Scan(t *testing.T) {
{s: `>=`, tok: scanner.GTE, raw: `>=`},
{s: `IN`, tok: scanner.IN, raw: `IN`},
{s: `IS`, tok: scanner.IS, raw: `IS`},
{s: `LIKE`, tok: scanner.LIKE, raw: `LIKE`},

// Misc tokens
{s: `(`, tok: scanner.LPAREN, raw: `(`},
Expand Down
6 changes: 4 additions & 2 deletions sql/scanner/token.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ const (
GTE // >=
IN // IN
IS // IS
LIKE // LIKE
operatorEnd

LPAREN // (
Expand Down Expand Up @@ -165,6 +166,7 @@ var tokens = [...]string{
GTE: ">=",
IN: "IN",
IS: "IS",
LIKE: "LIKE",

LPAREN: "(",
RPAREN: ")",
Expand Down Expand Up @@ -238,7 +240,7 @@ func initKeywords() {
for tok := keywordBeg + 1; tok < keywordEnd; tok++ {
keywords[strings.ToLower(tokens[tok])] = tok
}
for _, tok := range []Token{AND, OR, TRUE, FALSE, NULL, IN, IS} {
for _, tok := range []Token{AND, OR, TRUE, FALSE, NULL, IN, IS, LIKE} {
keywords[strings.ToLower(tokens[tok])] = tok
}
}
Expand All @@ -260,7 +262,7 @@ func (tok Token) Precedence() int {
return 2
case IN:
return 3
case EQ, NEQ, EQREGEX, NEQREGEX, LT, LTE, GT, GTE, IS:
case EQ, NEQ, EQREGEX, NEQREGEX, LT, LTE, GT, GTE, IS, LIKE:
return 4
case ADD, SUB, BITWISEOR, BITWISEXOR:
return 5
Expand Down