Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: ecma ranges with set terminator #55

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -25,3 +25,6 @@ _testmain.go
*.out

.DS_Store

# Ignore vscode status.
.vscode/
2 changes: 2 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
module github.com/dlclark/regexp2

go 1.13

require github.com/stretchr/testify v1.8.1
17 changes: 17 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk=
github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
111 changes: 111 additions & 0 deletions regexp_ecma_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
package regexp2_test

import (
"testing"

"github.com/dlclark/regexp2"
"github.com/stretchr/testify/require"
)

// TestECMA_charset exercises character-class parsing when the ECMAScript
// option is enabled. Every case is compiled with regexp2.ECMAScript OR'd into
// its own options. A case either expects compilation to fail with an exact
// error message, or expects the pattern to compile and yield a specific
// sequence of captured strings when matched repeatedly against its input.
func TestECMA_charset(t *testing.T) {
	type charsetCase struct {
		pattern string               // expression handed to regexp2.Compile
		input   string               // text searched for successive matches
		opts    regexp2.RegexOptions // extra options on top of ECMAScript
		matches []string             // expected capture strings, in match order
		errMsg  string               // expected compile error; empty means success
	}

	cases := map[string]charsetCase{
		"basic": {
			pattern: `[a-c]`,
			input:   "abcd",
			matches: []string{"a", "b", "c"},
		},
		"in-range": {
			pattern: `[a-\s\b]`,
			input:   "a-b cd",
			matches: []string{"a", "-", " "},
		},
		"space": {
			pattern: `[a-\s]`,
			input:   "a-b cd",
			matches: []string{"a", "-", " "},
		},
		"word": {
			pattern: `[a-\w]`,
			input:   "a-b cd",
			matches: []string{"a", "-", "b", "c", "d"},
		},
		"digit": {
			pattern: `[a-\d]`,
			input:   "a-b1 cd",
			matches: []string{"a", "-", "1"},
		},
		"slash-p": {
			pattern: `[a-\p]`,
			input:   "a-bq cd",
			matches: []string{"a", "b", "c", "d"},
		},
		"slash-p-literal": {
			pattern: `[a-\p{x}]`,
			input:   "a-bq cdx",
			matches: []string{"a", "b", "c", "d", "x"},
		},
		"invalid-unicode": {
			pattern: `[a-\p]`,
			opts:    regexp2.Unicode,
			errMsg:  "error parsing regexp: incomplete \\p{X} character escape in `[a-\\p]`",
		},
		"invalid-unicode-letter": {
			pattern: `[a-\p{L}]`,
			opts:    regexp2.Unicode,
			errMsg:  "error parsing regexp: cannot create range with shorthand escape sequence \\p in `[a-\\p{L}]`",
		},
		"invalid-slash-P": {
			pattern: `[a-\P]`,
			errMsg:  "error parsing regexp: cannot create range with shorthand escape sequence \\P in `[a-\\P]`",
		},
		"invalid-space": {
			pattern: `[\s-z]`,
			errMsg:  "error parsing regexp: cannot create range with shorthand escape sequence \\s in `[\\s-z]`",
		},
		"invalid-word": {
			pattern: `[\w-z]`,
			errMsg:  "error parsing regexp: cannot create range with shorthand escape sequence \\w in `[\\w-z]`",
		},
		"invalid-digit": {
			pattern: `[\d-z]`,
			errMsg:  "error parsing regexp: cannot create range with shorthand escape sequence \\d in `[\\d-z]`",
		},
		"invalid-point": {
			pattern: `[\p-z]`,
			errMsg:  "error parsing regexp: cannot create range with shorthand escape sequence \\p in `[\\p-z]`",
		},
	}

	for label, tc := range cases {
		t.Run(label, func(t *testing.T) {
			re, err := regexp2.Compile(tc.pattern, tc.opts|regexp2.ECMAScript)
			if tc.errMsg != "" {
				// Failure cases only check the compile error text.
				require.EqualError(t, err, tc.errMsg)
				return
			}
			require.NoError(t, err)

			// Walk every successive match, flattening all captures of all
			// groups into one slice in the order they are reported.
			var got []string
			m, err := re.FindStringMatch(tc.input)
			for {
				require.NoError(t, err)
				if m == nil {
					break
				}
				for _, grp := range m.Groups() {
					for _, capture := range grp.Captures {
						got = append(got, capture.String())
					}
				}
				m, err = re.FindNextMatch(m)
			}
			require.Equal(t, tc.matches, got)
		})
	}
}
2 changes: 0 additions & 2 deletions regexp_mono_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -710,8 +710,6 @@ func TestMono_Basics(t *testing.T) {
runRegexTrial(t, `.[X](.+)+[X][X]`, 0, "bbbbXXXaaaaaaaaa", "Fail.")
runRegexTrial(t, `.[X][X](.+)+[X]`, 0, "bbbbXXXaaaaaaaaa", "Fail.")
runRegexTrial(t, `tt+$`, 0, "xxxtt", "Pass. Group[0]=(3,2)")
runRegexTrial(t, `([\d-z]+)`, 0, "a0-za", "Pass. Group[0]=(1,3) Group[1]=(1,3)")
runRegexTrial(t, `([\d-\s]+)`, 0, "a0- z", "Pass. Group[0]=(1,3) Group[1]=(1,3)")
runRegexTrial(t, `\GX.*X`, 0, "aaaXbX", "Fail.")
runRegexTrial(t, `(\d+\.\d+)`, 0, "3.1415926", "Pass. Group[0]=(0,9) Group[1]=(0,9)")
runRegexTrial(t, `(\ba.{0,10}br)`, 0, "have a web browser", "Pass. Group[0]=(5,8) Group[1]=(5,8)")
Expand Down
106 changes: 82 additions & 24 deletions syntax/parser.go
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ const (
// Parser errors
ErrUnterminatedComment = "unterminated comment"
ErrInvalidCharRange = "invalid character class range"
ErrShorthandCharRange = "cannot create range with shorthand escape sequence \\%c"
ErrInvalidRepeatSize = "invalid repeat count"
ErrInvalidUTF8 = "invalid UTF-8"
ErrCaptureGroupOutOfRange = "capture group number out of range"
Expand Down Expand Up @@ -1170,14 +1171,19 @@ func (p *parser) scanBackslash(scanOnly bool) (*regexNode, error) {

case 'p', 'P':
p.moveRight(1)
prop, err := p.parseProperty()
prop, literal, err := p.parseProperty()
if err != nil {
return nil, err
}

cc := &CharSet{}
cc.addCategory(prop, (ch != 'p'), p.useOptionI(), p.patternRaw)
if p.useOptionI() {
cc.addLowercase()
if literal {
cc.addChar(ch)
} else {
cc.addCategory(prop, (ch != 'p'), p.useOptionI(), p.patternRaw)
if p.useOptionI() {
cc.addLowercase()
}
}

return newRegexNodeSet(ntSet, p.options, cc), nil
Expand Down Expand Up @@ -1310,13 +1316,20 @@ func (p *parser) scanBasicBackslash(scanOnly bool) (*regexNode, error) {
}

// Scans X for \p{X} or \P{X}
func (p *parser) parseProperty() (string, error) {
func (p *parser) parseProperty() (string, bool, error) {
if p.useOptionE() && !p.useOptionU() {
// Unless unicode is enabled ECMA \p is just a literal p
// and \P is invalid depending on the use case, which the
// caller must handle.
return "", true, nil
}

if p.charsRight() < 3 {
return "", p.getErr(ErrIncompleteSlashP)
return "", false, p.getErr(ErrIncompleteSlashP)
}
ch := p.moveRightGetChar()
if ch != '{' {
return "", p.getErr(ErrMalformedSlashP)
return "", false, p.getErr(ErrMalformedSlashP)
}

startpos := p.textpos()
Expand All @@ -1330,14 +1343,14 @@ func (p *parser) parseProperty() (string, error) {
capname := string(p.pattern[startpos:p.textpos()])

if p.charsRight() == 0 || p.moveRightGetChar() != '}' {
return "", p.getErr(ErrIncompleteSlashP)
return "", false, p.getErr(ErrIncompleteSlashP)
}

if !isValidUnicodeCat(capname) {
return "", p.getErr(ErrUnknownSlashP, capname)
return "", false, p.getErr(ErrUnknownSlashP, capname)
}

return capname, nil
return capname, false, nil
}

// Returns ReNode type for zero-length assertions with a \ code.
Expand Down Expand Up @@ -1427,7 +1440,7 @@ func (p *parser) scanCapname() string {
return string(p.pattern[startpos:p.textpos()])
}

//Scans contents of [] (not including []'s), and converts to a set.
// Scans contents of [] (not including []'s), and converts to a set.
func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) {
ch := '\x00'
chPrev := '\x00'
Expand Down Expand Up @@ -1461,13 +1474,19 @@ func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) {
closed = true
break
}

} else if ch == '\\' && p.charsRight() > 0 {
switch ch = p.moveRightGetChar(); ch {
case 'D', 'd':
if !scanOnly {
if inRange {
return nil, p.getErr(ErrBadClassInCharRange, ch)
if !p.useOptionE() {
return nil, p.getErr(ErrBadClassInCharRange, ch)
}
cc.addChar('-')
cc.addChar(chPrev)
inRange = false
} else if p.rangeStart() {
return nil, p.getErr(ErrShorthandCharRange, ch)
}
cc.addDigit(p.useOptionE() || p.useRE2(), ch == 'D', p.patternRaw)
}
Expand All @@ -1476,7 +1495,14 @@ func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) {
case 'S', 's':
if !scanOnly {
if inRange {
return nil, p.getErr(ErrBadClassInCharRange, ch)
if !p.useOptionE() {
return nil, p.getErr(ErrBadClassInCharRange, ch)
}
cc.addChar('-')
cc.addChar(chPrev)
inRange = false
} else if p.rangeStart() {
return nil, p.getErr(ErrShorthandCharRange, ch)
}
cc.addSpace(p.useOptionE(), p.useRE2(), ch == 'S')
}
Expand All @@ -1485,7 +1511,14 @@ func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) {
case 'W', 'w':
if !scanOnly {
if inRange {
return nil, p.getErr(ErrBadClassInCharRange, ch)
if !p.useOptionE() {
return nil, p.getErr(ErrBadClassInCharRange, ch)
}
cc.addChar('-')
cc.addChar(chPrev)
inRange = false
} else if p.rangeStart() {
return nil, p.getErr(ErrShorthandCharRange, ch)
}

cc.addWord(p.useOptionE() || p.useRE2(), ch == 'W')
Expand All @@ -1494,23 +1527,39 @@ func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) {

case 'p', 'P':
if !scanOnly {
prop, literal, err := p.parseProperty()
if err != nil {
return nil, err
}

if inRange {
return nil, p.getErr(ErrBadClassInCharRange, ch)
if p.useOptionE() {
if ch == 'P' || !literal {
return nil, p.getErr(ErrShorthandCharRange, ch)
}
} else if !literal {
return nil, p.getErr(ErrBadClassInCharRange, ch)
}

cc.addRange(chPrev, ch)
inRange = false
} else if p.rangeStart() {
return nil, p.getErr(ErrShorthandCharRange, ch)
} else if !literal {
cc.addCategory(prop, (ch != 'p'), caseInsensitive, p.patternRaw)
}
prop, err := p.parseProperty()
} else {
_, _, err := p.parseProperty()
if err != nil {
return nil, err
}
cc.addCategory(prop, (ch != 'p'), caseInsensitive, p.patternRaw)
} else {
p.parseProperty()
}

continue

case '-':
if !scanOnly {
cc.addRange(ch, ch)
cc.addChar(ch)
}
continue

Expand All @@ -1522,7 +1571,6 @@ func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) {
return nil, err
}
fTranslatedChar = true
break // this break will only break out of the switch
}
} else if ch == '[' {
// This is code for Posix style properties - [:Ll:] or [:IsTibetan:].
Expand Down Expand Up @@ -1579,7 +1627,7 @@ func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) {
cc.addRange(chPrev, ch)
}
}
} else if p.charsRight() >= 2 && p.rightChar(0) == '-' && p.rightChar(1) != ']' {
} else if p.rangeStart() {
// this could be the start of a range
chPrev = ch
inRange = true
Expand All @@ -1600,7 +1648,10 @@ func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) {
}
} else {
p.moveRight(1)
p.scanCharSet(caseInsensitive, true)
_, err := p.scanCharSet(caseInsensitive, true)
if err != nil {
return nil, err
}
}
} else {
if !scanOnly {
Expand Down Expand Up @@ -1919,6 +1970,13 @@ func (p *parser) rightMost() bool {
return p.currentPos == len(p.pattern)
}

// rangeStart reports whether the upcoming input could begin a character-class
// range: at least two characters remain, the next one is '-', and the one
// after it is not the closing ']'. It returns false otherwise.
func (p *parser) rangeStart() bool {
	// Need both the '-' and a following character to look at.
	if p.charsRight() < 2 {
		return false
	}
	return p.rightChar(0) == '-' && p.rightChar(1) != ']'
}

// Looks up the slot number for a given name
func (p *parser) captureSlotFromName(capname string) int {
return p.capnames[capname]
Expand Down
Loading