From 3c44de2e9002fb1e0b771080ad99a89f5be5c7ec Mon Sep 17 00:00:00 2001 From: Steven Levithan Date: Sun, 3 Nov 2024 18:20:49 +0100 Subject: [PATCH] Detect more forms of supportable \G --- scripts/onig-match.js | 8 ++-- spec/helpers/matchers.js | 8 ++-- spec/match-assertion.spec.js | 57 ++++++++++++++++++++++++++--- spec/match-backreference.spec.js | 18 ++++----- spec/match-char-class-range.spec.js | 8 ++-- spec/match-char-set.spec.js | 2 +- src/transform.js | 41 ++++++++++++++++++--- 7 files changed, 109 insertions(+), 33 deletions(-) diff --git a/scripts/onig-match.js b/scripts/onig-match.js index 217f460..41729e6 100644 --- a/scripts/onig-match.js +++ b/scripts/onig-match.js @@ -17,15 +17,15 @@ async function exec([pattern, str]) { const libMatches = []; let libMatch = transpiledRegExpResult(pattern, str, 0); - while (libMatch.result) { + while (libMatch.result !== null) { libMatches.push(libMatch); - libMatch = transpiledRegExpResult(pattern, str, libMatch.index + libMatch.result.length); + libMatch = transpiledRegExpResult(pattern, str, libMatch.index + (libMatch.result.length || 1)); } const onigMatches = []; let onigMatch = await onigurumaResult(pattern, str, 0); - while (onigMatch.result) { + while (onigMatch.result !== null) { onigMatches.push(onigMatch); - onigMatch = await onigurumaResult(pattern, str, onigMatch.index + onigMatch.result.length); + onigMatch = await onigurumaResult(pattern, str, onigMatch.index + (onigMatch.result.length || 1)); } console.log('Pattern:', color('yellow', pattern)); diff --git a/spec/helpers/matchers.js b/spec/helpers/matchers.js index bbb9bb3..afa36be 100644 --- a/spec/helpers/matchers.js +++ b/spec/helpers/matchers.js @@ -5,13 +5,13 @@ function getArgs(actual, expected) { const opts = { pattern: typeof expected === 'string' ? expected : expected.pattern, flags: expected.flags ?? '', - maxTarget: expected.maxTarget ?? null, - minTarget: expected.minTarget ?? null, + maxTestTarget: expected.maxTestTarget ?? null, + minTestTarget: expected.minTestTarget ?? null, }; const targets = ['ES2018', 'ES2024', 'ESNext']; const targeted = targets. - filter(target => !opts.maxTarget || (EsVersion[target] <= EsVersion[opts.maxTarget])). - filter(target => !opts.minTarget || (EsVersion[target] >= EsVersion[opts.minTarget])); + filter(target => !opts.maxTestTarget || (EsVersion[target] <= EsVersion[opts.maxTestTarget])). + filter(target => !opts.minTestTarget || (EsVersion[target] >= EsVersion[opts.minTestTarget])); return { pattern: opts.pattern, flags: opts.flags, diff --git a/spec/match-assertion.spec.js b/spec/match-assertion.spec.js index 0170a3d..da2d793 100644 --- a/spec/match-assertion.spec.js +++ b/spec/match-assertion.spec.js @@ -1,5 +1,6 @@ import {compile, toRegExp} from '../dist/index.mjs'; import {r} from '../src/utils.js'; +import {maxTestTargetForPatternMods} from './helpers/features.js'; import {matchers} from './helpers/matchers.js'; beforeEach(() => { @@ -67,17 +68,63 @@ describe('Assertion', () => { expect('abbcbb'.match(toRegExp(r`\G[ab]`, '', {global: true}))).toEqual(['a', 'b', 'b']); }); - it('should allow redundant assertions', () => { - expect('a').toExactlyMatch(r`\G\Ga`); - expect('b').toExactlyMatch(r`\Ga|\G\Gb`); - }); - // Unsupported: not emulatable without RegExp subclass it('should throw if not used at the start of every top-level alternative', () => { expect(() => compile(r`a\G`)).toThrow(); expect(() => compile(r`\Ga|b`)).toThrow(); expect(() => compile(r`a|\Gb`)).toThrow(); }); + + it('should allow if following a directive', () => { + expect('a').toExactlyMatch(r`\K\Ga`); + expect('a').toExactlyMatch({ + pattern: r`(?i)\Ga`, + maxTestTarget: maxTestTargetForPatternMods, + }); + expect('a').toExactlyMatch({ + pattern: r`(?i)(?m)\Ga`, + maxTestTarget: maxTestTargetForPatternMods, + }); + }); + + it('should allow if following an assertion', () => { + expect('a').toExactlyMatch(r`\A\Ga`); + expect('a').toExactlyMatch(r`\b\Ga`); + expect('a').toExactlyMatch(r`(?=a)\Ga`); + expect('a').toExactlyMatch(r`(? { + expect('a').toExactlyMatch(r`a*\Ga`); + expect('a').toExactlyMatch(r`(a)*\Ga`); + expect('a').toExactlyMatch(r`[a]*\Ga`); + }); + + it('should throw if following a non-0-min quantified token', () => { + expect(() => compile(r`a+\G`)).toThrow(); + expect(() => compile(r`a+?\G`)).toThrow(); + expect(() => compile(r`(a)+\G`)).toThrow(); + }); + + it('should check within groups to determine validity', () => { + expect('a').toExactlyMatch(r`(\Ga)`); + expect('a').toExactlyMatch(r`(?:(?>^(?\Ga)))`); + expect(() => compile(r`(?:(?>a(?\Gb)))`)).toThrow(); + expect('a').toExactlyMatch(r`\Ga|(((\Gb)))`); + expect(() => compile(r`\Ga|(((b\Gc)))`)).toThrow(); + }); + + it('should throw if leading in a non-0-min quantified group', () => { + expect(() => compile(r`(\Ga)+`)).toThrow(); + expect(() => compile(r`(\Ga)+\G`)).toThrow(); + }); + + // Documenting current behavior; supportable + it('should allow redundant assertions', () => { + expect(() => compile(r`\G\Ga`)).toThrow(); + expect(() => compile(r`\Ga|\G\Gb`)).toThrow(); + }); }); describe('string_end', () => { diff --git a/spec/match-backreference.spec.js b/spec/match-backreference.spec.js index f4ef1d1..ed1b970 100644 --- a/spec/match-backreference.spec.js +++ b/spec/match-backreference.spec.js @@ -252,11 +252,11 @@ describe('Backreference', () => { expect('').not.toFindMatch(r`(?(?\k))`); expect('aa').toExactlyMatch({ pattern: r`(?a)\k|(?b\k)`, - maxTarget: maxTestTargetForDuplicateNames, + maxTestTarget: maxTestTargetForDuplicateNames, }); expect(['a', 'b', 'ba', 'bb']).not.toFindMatch({ pattern: r`(?a)\k|(?b\k)`, - maxTarget: maxTestTargetForDuplicateNames, + maxTestTarget: maxTestTargetForDuplicateNames, }); }); @@ -265,11 +265,11 @@ describe('Backreference', () => { expect('aba').toExactlyMatch(r`(?a)(?b\k)`); expect(['aa', 'bcb']).toExactlyMatch({ pattern: r`(?a)\k|(?b)(?c\k)`, - maxTarget: maxTestTargetForDuplicateNames, + maxTestTarget: maxTestTargetForDuplicateNames, }); expect(['a', 'bc', 'bca', 'bcc']).not.toFindMatch({ pattern: r`(?a)\k|(?b)(?c\k)`, - maxTarget: maxTestTargetForDuplicateNames, + maxTestTarget: maxTestTargetForDuplicateNames, }); }); @@ -297,7 +297,7 @@ describe('Backreference', () => { expect('aab').toExactlyMatch(r`(?a)\k(?b)`); expect('aa').toExactlyMatch({ pattern: r`(?a)\k|(?b)`, - maxTarget: maxTestTargetForDuplicateNames, + maxTestTarget: maxTestTargetForDuplicateNames, }); }); @@ -345,19 +345,19 @@ describe('Backreference', () => { // rather than JS logic where they match the empty string expect(['aa', 'bb']).toExactlyMatch({ pattern: r`(?a)\k|(?b)\k`, - maxTarget: maxTestTargetForDuplicateNames, + maxTestTarget: maxTestTargetForDuplicateNames, }); expect(['a', 'b', 'ba']).not.toFindMatch({ pattern: r`(?a)\k|(?b)\k`, - maxTarget: maxTestTargetForDuplicateNames, + maxTestTarget: maxTestTargetForDuplicateNames, }); expect(['aa', 'bcb', 'bcc']).toExactlyMatch({ pattern: r`(?a)\k|(?b)(?c)\k`, - maxTarget: maxTestTargetForDuplicateNames, + maxTestTarget: maxTestTargetForDuplicateNames, }); expect(['a', 'bc', 'bca']).not.toFindMatch({ pattern: r`(?a)\k|(?b)(?c)\k`, - maxTarget: maxTestTargetForDuplicateNames, + maxTestTarget: maxTestTargetForDuplicateNames, }); }); diff --git a/spec/match-char-class-range.spec.js b/spec/match-char-class-range.spec.js index b836175..2c36c35 100644 --- a/spec/match-char-class-range.spec.js +++ b/spec/match-char-class-range.spec.js @@ -31,19 +31,19 @@ describe('CharacterClassRange', () => { it('should match unescaped hyphen as literal at intersection boundary', () => { expect('-').toExactlyMatch({ pattern: r`[a-&&\p{Any}]`, - minTarget: minTestTargetForFlagV, + minTestTarget: minTestTargetForFlagV, }); expect('-').toExactlyMatch({ pattern: r`[\w-&&\p{Any}]`, - minTarget: minTestTargetForFlagV, + minTestTarget: minTestTargetForFlagV, }); expect('-').toExactlyMatch({ pattern: r`[\p{Any}&&-a]`, - minTarget: minTestTargetForFlagV, + minTestTarget: minTestTargetForFlagV, }); expect('-').toExactlyMatch({ pattern: r`[\p{Any}&&-\w]`, - minTarget: minTestTargetForFlagV, + minTestTarget: minTestTargetForFlagV, }); }); diff --git a/spec/match-char-set.spec.js b/spec/match-char-set.spec.js index 18a21a6..c0e9b30 100644 --- a/spec/match-char-set.spec.js +++ b/spec/match-char-set.spec.js @@ -17,7 +17,7 @@ describe('CharacterSet', () => { it('should match line feed with flag m disabled', () => { expect('\n').toExactlyMatch({ pattern: r`(?-m)\O`, - maxTarget: maxTestTargetForPatternMods, + maxTestTarget: maxTestTargetForPatternMods, }); }); diff --git a/src/transform.js b/src/transform.js index 5f4cf15..d74a298 100644 --- a/src/transform.js +++ b/src/transform.js @@ -43,6 +43,7 @@ function transform(ast, options) { minTargetEs2024: isMinTarget(opts.bestEffortTarget, 'ES2024'), // Subroutines can appear before the groups they ref, so collect reffed nodes for a second pass subroutineRefMap: new Map(), + supportedGNodes: new Set(), }; traverse({node: ast}, firstPassState, FirstPassVisitor); // Global flags modified by the first pass @@ -105,7 +106,7 @@ const FirstPassVisitor = { }, }, - Assertion({node, parent, key, ast, remove, replaceWith}) { + Assertion({node, ast, remove, replaceWith}, {supportedGNodes}) { const {kind, negate} = node; if (kind === AstAssertionKinds.line_end) { // Onig's only line break char is line feed, unlike JS @@ -114,9 +115,7 @@ const FirstPassVisitor = { // Onig's only line break char is line feed, unlike JS replaceWith(parseFragment(r`(?<=\A|\n)`)); } else if (kind === AstAssertionKinds.search_start) { - // Allows multiple leading `\G`s since the the node is removed. Additional `\G` error - // checking in `Pattern` visitor - if (parent.parent !== ast.pattern || key !== 0) { + if (!supportedGNodes.has(node)) { throw new Error(r`Uses "\G" in a way that's unsupported for conversion to JS`); } ast.flags.sticky = true; @@ -259,14 +258,14 @@ const FirstPassVisitor = { !node.flags.enable && !node.flags.disable && delete node.flags; }, - Pattern({node}) { + Pattern({node}, {supportedGNodes}) { // For `\G` to be accurately emulatable using JS flag y, it must be at (and only at) the start // of every top-level alternative (with complex rules for what determines being at the start). // Additional `\G` error checking in `Assertion` visitor let hasAltWithLeadG = false; let hasAltWithoutLeadG = false; for (const alt of node.alternatives) { - if (alt.elements[0]?.kind === AstAssertionKinds.search_start) { + if (hasLeadingG(alt.elements, supportedGNodes)) { hasAltWithLeadG = true; } else { hasAltWithoutLeadG = true; @@ -642,6 +641,36 @@ function getParentAlternative(node) { return null; } +function hasLeadingG(els, supportedGNodes) { + if (!els.length) { + return false; + } + const firstToConsider = els.find(el => { + return el.kind === AstAssertionKinds.search_start ? + true : + ( el.type !== AstTypes.Directive && + el.type !== AstTypes.Assertion && + !(el.type === AstTypes.Quantifier && !el.min) + ); + }); + if (!firstToConsider) { + return false; + } + if (firstToConsider.kind === AstAssertionKinds.search_start) { + supportedGNodes.add(firstToConsider); + return true; + } + if (firstToConsider.type === AstTypes.Group || firstToConsider.type === AstTypes.CapturingGroup) { + for (const alt of firstToConsider.alternatives) { + if (!hasLeadingG(alt.elements, supportedGNodes)) { + return false; + } + } + return true; + } + return false; +} + function isValidGroupNameJs(name) { // JS group names are more restrictive than Onig; see //