Skip to content

Commit

Permalink
Multiplexing across alternation paths
Browse files Browse the repository at this point in the history
  • Loading branch information
slevithan committed Oct 30, 2024
1 parent 9f2677e commit a5c8f18
Show file tree
Hide file tree
Showing 4 changed files with 95 additions and 46 deletions.
38 changes: 19 additions & 19 deletions dist/index.min.js

Large diffs are not rendered by default.

55 changes: 49 additions & 6 deletions spec/match-backreference.spec.js
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ describe('Backreference', () => {
it('should not match if it references a not-yet-closed group', () => {
// In each pattern the backreference sits inside the capturing group it refers to, so that group
// is still open when the backreference is reached; no match may be found
expect('').not.toFindMatch(r`(\1)`);
expect('').not.toFindMatch(r`(((\2)))`);
// Single-string form of the check; the array form on the next line also lists 'aa'
expect('aa').not.toFindMatch(r`(a\1)`);
expect(['a', 'aa']).not.toFindMatch(r`(a\1)`);
});

it('should throw if not enough captures to the left', () => {
Expand Down Expand Up @@ -65,6 +65,11 @@ describe('Backreference', () => {
expect(['1231', '1232']).not.toFindMatch(r`(([123]))\g<1>\g<1>\2`);
});

it('should continue to reference the correct group when subroutines add captures', () => {
// The subroutine call `\g<1>` sits between groups 1 and 2; `\2` must still refer to `(b)`, so
// the subject has to end with 'b' rather than 'a'
expect('aabb').toExactlyMatch(r`(a)\g<1>(b)\2`);
expect('aaba').not.toFindMatch(r`(a)\g<1>(b)\2`);
});

it('should track independent captures when used in a group referenced by a subroutine', () => {
expect(['aaaa', 'aabb', 'bbaa', 'bbbb']).toExactlyMatch(r`((\w)\2)\g<1>`);
expect(['aaba', 'bbab']).not.toFindMatch(r`((\w)\2)\g<1>`);
Expand All @@ -80,7 +85,7 @@ describe('Backreference', () => {
it('should not match if it references a not-yet-closed group', () => {
// Same checks as for `\N`, using the `\k<N>` syntax: the backreference is inside the group it
// refers to, so no match may be found
expect('').not.toFindMatch(r`(\k<1>)`);
expect('').not.toFindMatch(r`(((\k<2>)))`);
// Single-string form of the check; the array form on the next line also lists 'aa'
expect('aa').not.toFindMatch(r`(a\k<1>)`);
expect(['a', 'aa']).not.toFindMatch(r`(a\k<1>)`);
});

it('should throw if not enough captures to the left', () => {
Expand Down Expand Up @@ -141,6 +146,11 @@ describe('Backreference', () => {
expect(['1231', '1232']).not.toFindMatch(r`(([123]))\g<1>\g<1>\k<2>`);
});

it('should continue to reference the correct group when subroutines add captures', () => {
// The subroutine call `\g<1>` sits between groups 1 and 2; `\k<2>` must still refer to `(b)`,
// so the subject has to end with 'b' rather than 'a'
expect('aabb').toExactlyMatch(r`(a)\g<1>(b)\k<2>`);
expect('aaba').not.toFindMatch(r`(a)\g<1>(b)\k<2>`);
});

it('should track independent captures when used in a group referenced by a subroutine', () => {
expect(['aaaa', 'aabb', 'bbaa', 'bbbb']).toExactlyMatch(r`((\w)\k<2>)\g<1>`);
expect(['aaba', 'bbab']).not.toFindMatch(r`((\w)\k<2>)\g<1>`);
Expand All @@ -156,7 +166,7 @@ describe('Backreference', () => {
it('should not match if it references a not-yet-closed group', () => {
// Same checks using the negative-number `\k<-N>` syntax: the backreference is inside the group
// it refers to, so no match may be found
expect('').not.toFindMatch(r`(\k<-1>)`);
expect('').not.toFindMatch(r`(((\k<-2>)))`);
// Single-string form of the check; the array form on the next line also lists 'aa'
expect('aa').not.toFindMatch(r`(a\k<-1>)`);
expect(['a', 'aa']).not.toFindMatch(r`(a\k<-1>)`);
});

it('should throw if not enough captures to the left', () => {
Expand Down Expand Up @@ -222,6 +232,11 @@ describe('Backreference', () => {
expect(['1231', '1232']).not.toFindMatch(r`(([123]))\g<1>\g<-2>\k<-1>`);
});

it('should continue to reference the correct group when subroutines add captures', () => {
// `\k<-1>` follows `(b)`; the earlier subroutine call `\g<1>` must not shift which group it
// refers to, so the subject has to end with 'b' rather than 'a'
expect('aabb').toExactlyMatch(r`(a)\g<1>(b)\k<-1>`);
expect('aaba').not.toFindMatch(r`(a)\g<1>(b)\k<-1>`);
});

it('should track independent captures when used in a group referenced by a subroutine', () => {
expect(['aaaa', 'aabb', 'bbaa', 'bbbb']).toExactlyMatch(r`((\w)\k<-1>)\g<1>`);
expect(['aaba', 'bbab']).not.toFindMatch(r`((\w)\k<-1>)\g<1>`);
Expand All @@ -239,7 +254,17 @@ describe('Backreference', () => {
it('should not match if it references a not-yet-closed group', () => {
// Named-group versions: the backreference is inside the group whose name it uses
expect('').not.toFindMatch(r`(?<a>\k<a>)`);
expect('').not.toFindMatch(r`(?<a>(?<b>(?<c>\k<b>)))`);
// Single-string form of the check; the array form on the next line also lists 'aa'
expect('aa').not.toFindMatch(r`(?<a>a\k<a>)`);
expect(['a', 'aa']).not.toFindMatch(r`(?<a>a\k<a>)`);
// Duplicate name: both groups named `a` are still open at the backreference
expect('').not.toFindMatch(r`(?<a>(?<a>\k<a>))`);
// Same name in two alternation paths: in the first path the backreference follows a closed group
// and can match; in the second path it sits inside its own group, so nothing starting with 'b'
// may match
expect('aa').toExactlyMatch(r`(?<n>a)\k<n>|(?<n>b\k<n>)`);
expect(['a', 'b', 'ba', 'bb']).not.toFindMatch(r`(?<n>a)\k<n>|(?<n>b\k<n>)`);
});

it('should only preclude the not-yet-closed groups when multiplexing', () => {
// Two groups share a name; the first is closed and the second is still open at the backreference.
// The backreference must still be able to match the text of the closed group.
expect('aa').toExactlyMatch(r`(?<a>a)(?<a>\k<a>)`);
expect('aba').toExactlyMatch(r`(?<n>a)(?<n>b\k<n>)`);
// In the second alternation path only the closed `(?<n>b)` is usable: 'bcb' matches, while 'bca'
// (group from the other path) and 'bcc' (the still-open group) do not
expect(['aa', 'bcb']).toExactlyMatch(r`(?<n>a)\k<n>|(?<n>b)(?<n>c\k<n>)`);
expect(['a', 'bc', 'bca', 'bcc']).not.toFindMatch(r`(?<n>a)\k<n>|(?<n>b)(?<n>c\k<n>)`);
});

it('should throw if capture is not to the left', () => {
Expand Down Expand Up @@ -273,12 +298,16 @@ describe('Backreference', () => {

it('should multiplex for duplicate names to the left', () => {
// With several same-named groups to its left, `\k<n>` may match the text of any one of them
expect(['aba', 'abb']).toExactlyMatch(r`(?<n>a)(?<n>b)\k<n>`);
// The `?` quantifier applies to the multiplexed backreference, so 'ab' alone also matches
expect(['aba', 'abb', 'ab']).toExactlyMatch(r`(?<n>a)(?<n>b)\k<n>?`);
expect(['abca', 'abcb', 'abcc']).toExactlyMatch(r`(?<n>a)(?<n>b)(?<n>c)\k<n>`);
expect(['aba', 'abb']).toExactlyMatch(r`(?<n>\w)(?<n>\w)\k<n>`);
// The final character must repeat one of the two captured characters
expect(['aab', 'abc']).not.toFindMatch(r`(?<n>\w)(?<n>\w)\k<n>`);
});

// TODO: Multiplexing changes across multiple backrefs as more duplicate groups are added
it('should increase multiplexing as duplicate names are added to the left', () => {
// The first `\k<n>` has only `(?<n>a)` to its left and must match 'a'; the second has both
// groups to its left and may match 'a' or 'b'
expect(['aaba', 'aabb']).toExactlyMatch(r`(?<n>a)\k<n>(?<n>b)\k<n>`);
// Here the second character is 'b', which the first backreference cannot match
expect(['abba', 'abbb']).not.toFindMatch(r`(?<n>a)\k<n>(?<n>b)\k<n>`);
});

it('should ref the most recent of a capture/subroutine set without multiplexing', () => {
expect('abb').toExactlyMatch(r`(?<a>\w)\g<a>\k<a>`);
Expand All @@ -287,7 +316,12 @@ describe('Backreference', () => {
expect('aba').not.toFindMatch(r`\g<a>(?<a>\w)\k<a>`);
});

it('should multiplex for duplicate names to the left but use only the most recent of an indirect capture/subroutine set', () => {
it('should continue to reference the correct group when subroutines add captures', () => {
// The subroutine call `\g<a>` sits between groups `a` and `b`; `\k<b>` must still refer to
// `(?<b>b)`, so the subject has to end with 'b' rather than 'a'
expect('aabb').toExactlyMatch(r`(?<a>a)\g<a>(?<b>b)\k<b>`);
expect('aaba').not.toFindMatch(r`(?<a>a)\g<a>(?<b>b)\k<b>`);
});

it('should multiplex for duplicate names to the left but only use the most recent of an indirect capture/subroutine set', () => {
expect([ // All possible matches
'1010', '1011', '1020', '1022', '2010', '2011', '2020', '2022',
]).toExactlyMatch(r`(?<a>(?<b>[12]))(?<b>0)\g<a>\k<b>`);
Expand All @@ -300,6 +334,15 @@ describe('Backreference', () => {
expect(['12301', '12302']).not.toFindMatch(r`(?<a>(?<b>[123]))\g<a>\g<a>(?<b>0)\k<b>`);
});

it('should preclude groups not in the alternation path when multiplexing', () => {
// This enforces Oniguruma logic where backrefs to nonparticipating groups fail to match
// rather than JS logic where they match the empty string
expect(['aa', 'bb']).toExactlyMatch(r`(?<n>a)\k<n>|(?<n>b)\k<n>`);
// 'ba' must fail: the group that captures 'a' belongs to the other alternative
expect(['a', 'b', 'ba']).not.toFindMatch(r`(?<n>a)\k<n>|(?<n>b)\k<n>`);
// Within the second alternative both of its own same-named groups remain usable ('bcb', 'bcc')
expect(['aa', 'bcb', 'bcc']).toExactlyMatch(r`(?<n>a)\k<n>|(?<n>b)(?<n>c)\k<n>`);
expect(['a', 'bc', 'bca']).not.toFindMatch(r`(?<n>a)\k<n>|(?<n>b)(?<n>c)\k<n>`);
});

it('should track independent captures when used in a group referenced by a subroutine', () => {
expect(['aaaa', 'aabb', 'bbaa', 'bbbb']).toExactlyMatch(r`(?<a>(?<b>\w)\k<b>)\g<a>`);
expect(['aaba', 'bbab']).not.toFindMatch(r`(?<a>(?<b>\w)\k<b>)\g<a>`);
Expand Down
3 changes: 2 additions & 1 deletion src/generate.js
Original file line number Diff line number Diff line change
Expand Up @@ -238,7 +238,8 @@ function genCapturingGroup({name, number, alternatives}, state, gen) {
if (name) {
if (state.groupNames.has(name)) {
if (!state.useDuplicateNames) {
// Keep the name only in the first alternation path that used it
// Keep the name only in the first alternation path that used it; the transformer already
// stripped all but the last duplicate name per alternation path
name = null;
}
} else {
Expand Down
45 changes: 25 additions & 20 deletions src/transform.js
Original file line number Diff line number Diff line change
Expand Up @@ -127,15 +127,6 @@ const FirstPassVisitor = {
// Kinds `lookahead` and `lookbehind` also don't need transformation
},

// Visitor handler for backreference nodes. Receives the visited `node` and a `replaceWith`
// callback used to substitute a different node for it.
Backreference({node, replaceWith}) {
// Convert backrefs reffing a not-yet-closed group to `(?!)`; they can't match in Onig but
// match an empty string in JS
// NOTE(review): `getAllParents` presumably returns every ancestor accepted by the predicate,
// i.e. the capturing groups still open at this node; confirm against its definition
const unclosed = getAllParents(node, node => node.type === AstTypes.CapturingGroup);
// `node.ref` is compared against both the group's number and its name
if (unclosed.some(capture => capture.number === node.ref || capture.name === node.ref)) {
replaceWith(createLookaround({negate: true}));
}
},

CapturingGroup({node}, {subroutineRefMap}) {
const {name, number} = node;
if (name && !isValidGroupNameJs(name)) {
Expand Down Expand Up @@ -378,6 +369,7 @@ const SecondPassVisitor = {
const ref = name ?? number;
// Has value if we're within a subroutine expansion
const origin = groupOriginByCopy.get(node);
const parentAlt = getParentAlternative(node);

// ## Handle recursion; runs after subroutine expansion
// TODO: Can this be refactored into conditions for `isDirectRecursion` and `isIndirectRecursion`?
Expand Down Expand Up @@ -413,13 +405,19 @@ const SecondPassVisitor = {
// tracked multiplexed nodes for this group name or number to see if there's a node being
// replaced by this capture
const multiplex = multiplexNodes[i];
const mpName = multiplex.node.name;
if (
// This group is from subroutine expansion, and there's a multiplex value from either the
// origin node or a prior subroutine expansion group with the same origin
(origin === multiplex.node || (origin && origin === multiplex.origin)) ||
// This group is not from subroutine expansion, and it comes after a subroutine expansion
// group that refers to itself
node === multiplex.origin
// group that refers to this group
node === multiplex.origin ||
// The multiplex node is a named group that's not in the current alternation path (which
// will mean it's nonparticipating for any following backrefs); remove it from
// multiplexing since backrefs to nonparticipating groups can't match in Onig but match
// the empty string in JS
(mpName && !getOrCreate(namedGroupsInScopeByAlt, parentAlt, new Map()).has(mpName))
) {
multiplexNodes.splice(i, 1);
break;
Expand All @@ -432,20 +430,19 @@ const SecondPassVisitor = {
// nested groups), so if using a duplicate name for this alternation path, remove the name from
// all but the latest instance (also applies to groups added via subroutine expansion)
if (name) {
let parentAlt = getParentAlternative(node);
const namedGroupsInScope = getOrCreate(namedGroupsInScopeByAlt, parentAlt, new Map());
if (namedGroupsInScope.has(name)) {
// Change the earlier instance of this group name to an unnamed capturing group
// Will change the earlier instance of this group name to an unnamed capturing group
groupsWithDuplicateNamesToRemove.add(namedGroupsInScope.get(name));
}
// Track the latest instance of this group name, and pass it up through parent alternatives
namedGroupsInScope.set(name, node);
// Skip the immediate parent alt because we don't want subsequent sibling alts to consider
// named groups from their preceding siblings
parentAlt = getParentAlternative(parentAlt);
if (parentAlt) {
while ((parentAlt = getParentAlternative(parentAlt))) {
getOrCreate(namedGroupsInScopeByAlt, parentAlt, new Map()).set(name, node);
let upAlt = getParentAlternative(parentAlt);
if (upAlt) {
while ((upAlt = getParentAlternative(upAlt))) {
getOrCreate(namedGroupsInScopeByAlt, upAlt, new Map()).set(name, node);
}
}
}
Expand Down Expand Up @@ -504,16 +501,24 @@ const ThirdPassVisitor = {

Backreference({node, replaceWith}, {reffedNodesByBackreference}) {
const refNodes = reffedNodesByBackreference.get(node);
const unclosedCaps = getAllParents(node, node => node.type === AstTypes.CapturingGroup);
// For the backref's `ref`, use `number` rather than `name` because group names might have been
// removed if they're duplicates within their alternation path, or they might be removed later
// by the generator (depending on options) if they're duplicates within the overall pattern.
// Backrefs must come after groups they ref, so reffed node `number`s are already recalculated
// by the generator (depending on target) if they're duplicates within the overall pattern.
// Backrefs must come after groups they ref, so reffed node `number`s are already recalculated.
// Also, convert backrefs to not-yet-closed groups to `(?!)`; they can't match in Onig but
// match the empty string in JS
if (refNodes.length > 1) {
const alts = refNodes.map(reffedGroupNode => adoptAndSwapKids(
createAlternative(),
[createBackreference(reffedGroupNode.number)]
[ unclosedCaps.some(cap => cap.number === reffedGroupNode.number) ?
createLookaround({negate: true}) :
createBackreference(reffedGroupNode.number)
]
));
replaceWith(adoptAndSwapKids(createGroup(), alts));
} else if (unclosedCaps.some(cap => cap.number === node.ref || cap.name === node.ref)) {
replaceWith(createLookaround({negate: true}));
} else {
node.ref = refNodes[0].number;
}
Expand Down

0 comments on commit a5c8f18

Please sign in to comment.