diff --git a/.gitignore b/.gitignore index fb844c3..5e4c21e 100644 --- a/.gitignore +++ b/.gitignore @@ -25,3 +25,6 @@ _testmain.go *.out .DS_Store + +# Ignore vscode status. +.vscode/ diff --git a/go.mod b/go.mod index 9f7f391..d5db1e4 100644 --- a/go.mod +++ b/go.mod @@ -1,3 +1,5 @@ module github.com/dlclark/regexp2 go 1.13 + +require github.com/stretchr/testify v1.8.1 diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..2ec90f7 --- /dev/null +++ b/go.sum @@ -0,0 +1,17 @@ +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= +github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= +github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= +github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk= +github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/regexp_ecma_test.go b/regexp_ecma_test.go new file mode 100644 index 0000000..6d5b5d3 --- /dev/null +++ b/regexp_ecma_test.go @@ -0,0 +1,111 @@ +package regexp2_test + +import ( + "testing" + + "github.com/dlclark/regexp2" + "github.com/stretchr/testify/require" +) + +func TestECMA_charset(t *testing.T) { + tests := map[string]struct { + expr string + data string + opt regexp2.RegexOptions + want []string + wantErr string + }{ + "basic": { + expr: `[a-c]`, + data: "abcd", + want: []string{"a", "b", "c"}, + }, + "in-range": { + expr: `[a-\s\b]`, + data: "a-b cd", + want: []string{"a", "-", " "}, + }, + "space": { + expr: `[a-\s]`, + data: "a-b cd", + want: []string{"a", "-", " "}, + }, + "word": { + expr: `[a-\w]`, + data: "a-b cd", + want: []string{"a", "-", "b", "c", "d"}, + }, + "digit": { + expr: `[a-\d]`, + data: "a-b1 cd", + want: []string{"a", "-", "1"}, + }, + "slash-p": { + expr: `[a-\p]`, + data: "a-bq cd", + want: []string{"a", "b", "c", "d"}, + }, + "slash-p-literal": { + expr: `[a-\p{x}]`, + data: "a-bq cdx", + want: []string{"a", "b", "c", "d", "x"}, + }, + "invalid-unicode": { + expr: `[a-\p]`, + opt: regexp2.Unicode, + wantErr: "error parsing regexp: incomplete \\p{X} character escape in `[a-\\p]`", + }, + "invalid-unicode-letter": { + expr: `[a-\p{L}]`, + opt: regexp2.Unicode, + wantErr: "error parsing regexp: cannot create range with shorthand escape sequence \\p in `[a-\\p{L}]`", + }, + "invalid-slash-P": { + expr: `[a-\P]`, + wantErr: "error parsing regexp: cannot create range with shorthand escape sequence \\P in `[a-\\P]`", + }, + "invalid-space": { + expr: `[\s-z]`, + wantErr: "error parsing regexp: cannot create range with shorthand escape sequence \\s in `[\\s-z]`", + }, + "invalid-word": { + expr: `[\w-z]`, + wantErr: "error parsing regexp: cannot create range with shorthand escape sequence \\w in `[\\w-z]`", + }, + "invalid-digit": { + expr: `[\d-z]`, + wantErr: "error parsing regexp: cannot create range with shorthand escape sequence \\d in `[\\d-z]`", + }, + "invalid-point": { + expr: `[\p-z]`, + wantErr: "error parsing regexp: cannot create range with shorthand escape sequence \\p in `[\\p-z]`", + }, + } + + for name, tt := range tests { + t.Run(name, func(t *testing.T) { + re, err := regexp2.Compile(tt.expr, tt.opt|regexp2.ECMAScript) + if tt.wantErr != "" { + require.EqualError(t, err, tt.wantErr) + return + } + require.NoError(t, err) + + match, err := re.FindStringMatch(tt.data) + require.NoError(t, err) + + var res []string + for match != nil { + for _, g := range match.Groups() { + for _, c := range g.Captures { + res = append(res, c.String()) + } + } + + match, err = re.FindNextMatch(match) + require.NoError(t, err) + } + require.Equal(t, tt.want, res) + }) + } +} diff --git a/regexp_mono_test.go b/regexp_mono_test.go index 372e81c..aa3cf7d 100644 --- a/regexp_mono_test.go +++ b/regexp_mono_test.go @@ -710,8 +710,6 @@ func TestMono_Basics(t *testing.T) { runRegexTrial(t, `.[X](.+)+[X][X]`, 0, "bbbbXXXaaaaaaaaa", "Fail.") runRegexTrial(t, `.[X][X](.+)+[X]`, 0, "bbbbXXXaaaaaaaaa", "Fail.") runRegexTrial(t, `tt+$`, 0, "xxxtt", "Pass. Group[0]=(3,2)") - runRegexTrial(t, `([\d-z]+)`, 0, "a0-za", "Pass. Group[0]=(1,3) Group[1]=(1,3)") - runRegexTrial(t, `([\d-\s]+)`, 0, "a0- z", "Pass. Group[0]=(1,3) Group[1]=(1,3)") runRegexTrial(t, `\GX.*X`, 0, "aaaXbX", "Fail.") runRegexTrial(t, `(\d+\.\d+)`, 0, "3.1415926", "Pass. Group[0]=(0,9) Group[1]=(0,9)") runRegexTrial(t, `(\ba.{0,10}br)`, 0, "have a web browser", "Pass. Group[0]=(5,8) Group[1]=(5,8)") diff --git a/syntax/parser.go b/syntax/parser.go index 9dc6e31..77d6c95 100644 --- a/syntax/parser.go +++ b/syntax/parser.go @@ -75,6 +75,7 @@ const ( // Parser errors ErrUnterminatedComment = "unterminated comment" ErrInvalidCharRange = "invalid character class range" + ErrShorthandCharRange = "cannot create range with shorthand escape sequence \\%c" ErrInvalidRepeatSize = "invalid repeat count" ErrInvalidUTF8 = "invalid UTF-8" ErrCaptureGroupOutOfRange = "capture group number out of range" @@ -1170,14 +1171,19 @@ func (p *parser) scanBackslash(scanOnly bool) (*regexNode, error) { case 'p', 'P': p.moveRight(1) - prop, err := p.parseProperty() + prop, literal, err := p.parseProperty() if err != nil { return nil, err } + cc := &CharSet{} - cc.addCategory(prop, (ch != 'p'), p.useOptionI(), p.patternRaw) - if p.useOptionI() { - cc.addLowercase() + if literal { + cc.addChar(ch) + } else { + cc.addCategory(prop, (ch != 'p'), p.useOptionI(), p.patternRaw) + if p.useOptionI() { + cc.addLowercase() + } } return newRegexNodeSet(ntSet, p.options, cc), nil @@ -1310,13 +1316,20 @@ func (p *parser) scanBasicBackslash(scanOnly bool) (*regexNode, error) { } // Scans X for \p{X} or \P{X} -func (p *parser) parseProperty() (string, error) { +func (p *parser) parseProperty() (string, bool, error) { + if p.useOptionE() && !p.useOptionU() { + // Unless unicode is enabled ECMA \p is just a literal p + // and \P is invalid depending on the use case, which the + // caller must handle. + return "", true, nil + } + if p.charsRight() < 3 { - return "", p.getErr(ErrIncompleteSlashP) + return "", false, p.getErr(ErrIncompleteSlashP) } ch := p.moveRightGetChar() if ch != '{' { - return "", p.getErr(ErrMalformedSlashP) + return "", false, p.getErr(ErrMalformedSlashP) } startpos := p.textpos() @@ -1330,14 +1343,14 @@ func (p *parser) parseProperty() (string, error) { capname := string(p.pattern[startpos:p.textpos()]) if p.charsRight() == 0 || p.moveRightGetChar() != '}' { - return "", p.getErr(ErrIncompleteSlashP) + return "", false, p.getErr(ErrIncompleteSlashP) } if !isValidUnicodeCat(capname) { - return "", p.getErr(ErrUnknownSlashP, capname) + return "", false, p.getErr(ErrUnknownSlashP, capname) } - return capname, nil + return capname, false, nil } // Returns ReNode type for zero-length assertions with a \ code. @@ -1427,7 +1440,7 @@ func (p *parser) scanCapname() string { return string(p.pattern[startpos:p.textpos()]) } -//Scans contents of [] (not including []'s), and converts to a set. +// Scans contents of [] (not including []'s), and converts to a set. func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) { ch := '\x00' chPrev := '\x00' @@ -1461,13 +1474,19 @@ func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) { closed = true break } - } else if ch == '\\' && p.charsRight() > 0 { switch ch = p.moveRightGetChar(); ch { case 'D', 'd': if !scanOnly { if inRange { - return nil, p.getErr(ErrBadClassInCharRange, ch) + if !p.useOptionE() { + return nil, p.getErr(ErrBadClassInCharRange, ch) + } + cc.addChar('-') + cc.addChar(chPrev) + inRange = false + } else if p.rangeStart() { + return nil, p.getErr(ErrShorthandCharRange, ch) } cc.addDigit(p.useOptionE() || p.useRE2(), ch == 'D', p.patternRaw) } @@ -1476,7 +1495,14 @@ func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) { case 'S', 's': if !scanOnly { if inRange { - return nil, p.getErr(ErrBadClassInCharRange, ch) + if !p.useOptionE() { + return nil, p.getErr(ErrBadClassInCharRange, ch) + } + cc.addChar('-') + cc.addChar(chPrev) + inRange = false + } else if p.rangeStart() { + return nil, p.getErr(ErrShorthandCharRange, ch) } cc.addSpace(p.useOptionE(), p.useRE2(), ch == 'S') } @@ -1485,7 +1511,14 @@ func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) { case 'W', 'w': if !scanOnly { if inRange { - return nil, p.getErr(ErrBadClassInCharRange, ch) + if !p.useOptionE() { + return nil, p.getErr(ErrBadClassInCharRange, ch) + } + cc.addChar('-') + cc.addChar(chPrev) + inRange = false + } else if p.rangeStart() { + return nil, p.getErr(ErrShorthandCharRange, ch) } cc.addWord(p.useOptionE() || p.useRE2(), ch == 'W') @@ -1494,23 +1527,39 @@ func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) { case 'p', 'P': if !scanOnly { + prop, literal, err := p.parseProperty() + if err != nil { + return nil, err + } + if inRange { - return nil, p.getErr(ErrBadClassInCharRange, ch) + if p.useOptionE() { + if ch == 'P' || !literal { + return nil, p.getErr(ErrShorthandCharRange, ch) + } + } else if !literal { + return nil, p.getErr(ErrBadClassInCharRange, ch) + } + + cc.addRange(chPrev, ch) + inRange = false + } else if p.rangeStart() { + return nil, p.getErr(ErrShorthandCharRange, ch) + } else if !literal { + cc.addCategory(prop, (ch != 'p'), caseInsensitive, p.patternRaw) } - prop, err := p.parseProperty() + } else { + _, _, err := p.parseProperty() if err != nil { return nil, err } - cc.addCategory(prop, (ch != 'p'), caseInsensitive, p.patternRaw) - } else { - p.parseProperty() } continue case '-': if !scanOnly { - cc.addRange(ch, ch) + cc.addChar(ch) } continue @@ -1522,7 +1571,6 @@ func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) { return nil, err } fTranslatedChar = true - break // this break will only break out of the switch } } else if ch == '[' { // This is code for Posix style properties - [:Ll:] or [:IsTibetan:]. @@ -1579,7 +1627,7 @@ func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) { cc.addRange(chPrev, ch) } } - } else if p.charsRight() >= 2 && p.rightChar(0) == '-' && p.rightChar(1) != ']' { + } else if p.rangeStart() { // this could be the start of a range chPrev = ch inRange = true @@ -1600,7 +1648,10 @@ func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) { } } else { p.moveRight(1) - p.scanCharSet(caseInsensitive, true) + _, err := p.scanCharSet(caseInsensitive, true) + if err != nil { + return nil, err + } } } else { if !scanOnly { @@ -1919,6 +1970,13 @@ func (p *parser) rightMost() bool { return p.currentPos == len(p.pattern) } +// rangeStart returns true if this might be a possible range start, false. +func (p *parser) rangeStart() bool { + return p.charsRight() > 1 && + p.rightChar(0) == '-' && + p.rightChar(1) != ']' +} + // Looks up the slot number for a given name func (p *parser) captureSlotFromName(capname string) int { return p.capnames[capname] diff --git a/testoutput1 b/testoutput1 index fbf63fd..14c61c4 100644 --- a/testoutput1 +++ b/testoutput1 @@ -1995,13 +1995,6 @@ No match aaa No match -/[\d-z]+/ - 12-34z - 0: 12-34z -\= Expect no match - aaa -No match - /\x5c/ \\ 0: \ @@ -5685,17 +5678,6 @@ No match ZABCDEFG 0: 1: - -/^[\d-a]/ - abcde - 0: a - -things - 0: - - 0digit - 0: 0 -\= Expect no match - bcdef -No match /[\s]+/ > \x09\x0a\x0c\x0d\x0b<