Skip to content

Commit

Permalink
Add flag D; use Unicode \d by default
Browse files Browse the repository at this point in the history
  • Loading branch information
slevithan committed Nov 21, 2024
1 parent e0bae15 commit db1d8bb
Show file tree
Hide file tree
Showing 7 changed files with 55 additions and 18 deletions.
30 changes: 24 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -265,7 +265,7 @@ Notice that nearly every feature below has at least subtle differences from Java
</tr>

<tr valign="top">
<th align="left" rowspan="6">Flags</th>
<th align="left" rowspan="7">Flags</th>
<td colspan="5"><i>Supported in top-level flags and pattern modifiers</i></td>
</tr>
<tr valign="top">
Expand Down Expand Up @@ -303,13 +303,22 @@ Notice that nearly every feature below has at least subtle differences from Java
<tr valign="top">
<td colspan="5"><i>Currently supported only in top-level flags</i></td>
</tr>
<tr valign="top">
<td>Digit is ASCII</td>
<td><code>D</code></td>
<td align="middle">✅</td>
<td align="middle">✅</td>
<td>
✔ ASCII <code>\d</code>, <code>\p{Digit}</code>, <code>[[:digit:]]</code><br>
</td>
</tr>
<tr valign="top">
<td>Word is ASCII</td>
<td><code>W</code></td>
<td align="middle">✅</td>
<td align="middle">✅</td>
<td>
✔ ASCII <code>\b</code>, <code>\w</code>, <code>[[:word:]]</code>, <code>\p{Word}</code><br>
✔ ASCII <code>\b</code>, <code>\w</code>, <code>\p{Word}</code>, <code>[[:word:]]</code><br>
</td>
</tr>

Expand Down Expand Up @@ -438,13 +447,13 @@ Notice that nearly every feature below has at least subtle differences from Java
</tr>

<tr valign="top">
<th align="left" rowspan="7">Character sets</th>
<td>Digit, word</td>
<td><code>\d</code>, <code>\w</code>, etc.</td>
<th align="left" rowspan="8">Character sets</th>
<td>Digit</td>
<td><code>\d</code>, <code>\D</code></td>
<td align="middle">✅</td>
<td align="middle">✅</td>
<td>
Same as JS (ASCII)<br>
✔ Unicode by default (≠ JS)<br>
</td>
</tr>
<tr valign="top">
Expand All @@ -465,6 +474,15 @@ Notice that nearly every feature below has at least subtle differences from Java
✔ ASCII (≠ JS)<br>
</td>
</tr>
<tr valign="top">
<td>Word</td>
<td><code>\w</code>, <code>\W</code></td>
<td align="middle">✅</td>
<td align="middle">✅</td>
<td>
✔ Unicode by default (≠ JS)<br>
</td>
</tr>
<tr valign="top">
<td>Dot</td>
<td><code>.</code></td>
Expand Down
3 changes: 3 additions & 0 deletions demo/demo.js
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ const state = {
i: getValue('flag-i'),
m: getValue('flag-m'),
x: getValue('flag-x'),
D: getValue('flag-D'),
W: getValue('flag-W'),
},
opts: {
Expand Down Expand Up @@ -73,6 +74,8 @@ function showTranspiled() {
state.flags.m ? 'm' : ''
}${
state.flags.x ? 'x' : ''
}${
state.flags.D ? 'D' : ''
}${
state.flags.W ? 'W' : ''
}`,
Expand Down
5 changes: 5 additions & 0 deletions demo/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,11 @@ <h2>Try it</h2>
<kbd>x</kbd>
<span class="tip tip-lg">Insignificant whitespace and comments</span>
</label>
<label>
<input type="checkbox" id="flag-D" onchange="setFlag('D', this.checked)">
<kbd>D</kbd>
<span class="tip tip-sm">Digit is ASCII</span>
</label>
<label>
<input type="checkbox" id="flag-W" onchange="setFlag('W', this.checked)">
<kbd>W</kbd>
Expand Down
3 changes: 2 additions & 1 deletion src/parse.js
Original file line number Diff line number Diff line change
Expand Up @@ -542,12 +542,13 @@ function createDirectiveFromToken({kind, flags}) {
return node;
}

function createFlags({ignoreCase, dotAll, extended, wordIsAscii}) {
function createFlags({ignoreCase, dotAll, extended, digitIsAscii, wordIsAscii}) {
return {
type: AstTypes.Flags,
ignoreCase,
dotAll,
extended,
digitIsAscii,
wordIsAscii,
};
}
Expand Down
6 changes: 4 additions & 2 deletions src/tokenize.js
Original file line number Diff line number Diff line change
Expand Up @@ -136,8 +136,8 @@ function tokenize(pattern, flags = '') {
if (typeof pattern !== 'string') {
throw new Error('String expected as pattern');
}
if (!/^[imxW]*$/.test(flags)) {
throw new Error(`Flags "${flags}" unsupported`);
if (!/^[imxDW]*$/.test(flags)) {
throw new Error(`Flags "${flags}" includes unsupported value`);
}
const xStack = [flags.includes('x')];
const context = {
Expand Down Expand Up @@ -196,6 +196,8 @@ function tokenize(pattern, flags = '') {
dotAll: flags.includes('m'),
// Flag x is fully handled during tokenization
extended: flags.includes('x'),
// Flag D is currently only supported as a top-level flag
digitIsAscii: flags.includes('D'),
// Flag W is currently only supported as a top-level flag
wordIsAscii: flags.includes('W'),
},
Expand Down
8 changes: 7 additions & 1 deletion src/transform.js
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ function transform(ast, options) {
// Subroutines can appear before the groups they ref, so collect reffed nodes for a second pass
subroutineRefMap: new Map(),
supportedGNodes: new Set(),
digitIsAscii: ast.flags.digitIsAscii,
wordIsAscii: ast.flags.wordIsAscii,
};
traverse({node: ast}, firstPassState, FirstPassVisitor);
Expand Down Expand Up @@ -155,10 +156,12 @@ const FirstPassVisitor = {
subroutineRefMap.set(name ?? number, node);
},

CharacterSet({node, replaceWith}, {accuracy, minTargetEs2024, wordIsAscii}) {
CharacterSet({node, replaceWith}, {accuracy, minTargetEs2024, digitIsAscii, wordIsAscii}) {
const {kind, negate, value} = node;
if (kind === AstCharacterSetKinds.any) {
replaceWith(createUnicodeProperty('Any'));
} else if (kind === AstCharacterSetKinds.digit && !digitIsAscii) {
replaceWith(createUnicodeProperty('Nd', {negate}));
} else if (kind === AstCharacterSetKinds.hex) {
replaceWith(createUnicodeProperty('AHex', {negate}));
} else if (kind === AstCharacterSetKinds.non_newline) {
Expand All @@ -178,6 +181,8 @@ const FirstPassVisitor = {
ascii = `\0-${cp(ascii.codePointAt(0) - 1)}${cp(ascii.codePointAt(2) + 1)}-\u{10FFFF}`;
}
replaceWith(parseFragment(`[${ascii}]`));
} else if (value === 'digit' && digitIsAscii) {
replaceWith(createCharacterSet(AstCharacterSetKinds.digit, {negate}));
} else if (value === 'word' && wordIsAscii) {
replaceWith(createCharacterSet(AstCharacterSetKinds.word, {negate}));
} else {
Expand Down Expand Up @@ -228,6 +233,7 @@ const FirstPassVisitor = {
Flags({node, parent}) {
// Remove Onig flags that aren't available in JS
delete node.extended; // Flag x
delete node.digitIsAscii; // Flag D
delete node.wordIsAscii; // Flag W
Object.assign(node, {
// JS flag g; no Onig equiv
Expand Down
18 changes: 10 additions & 8 deletions src/unicode.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ const CharsWithoutIgnoreCaseExpansion = new Set([
cp(0x131), // ı
]);

// Different from `PosixClassesMap`'s `word`
const defaultWordChar = r`[\p{L}\p{M}\p{N}\p{Pc}]`;

function getIgnoreCaseMatchChars(char) {
Expand Down Expand Up @@ -241,14 +242,15 @@ const PosixProperties = new Set([
'word',
'xdigit',
// The following are available with the same name in JS (see `JsUnicodeProperties`)
// - alpha (JS: Alpha)
// - ascii (JS: ASCII)
// - cntrl (JS: cntrl)
// - digit (JS: digit)
// - lower (JS: Lower)
// - punct (JS: punct)
// - space (JS: space)
// - upper (JS: Upper)
// Explicitly include `digit` for the sake of flag D (`digitIsAscii`) handling as POSIX
'digit', // (JS: digit)
// 'alpha', // (JS: Alpha)
// 'ascii', // (JS: ASCII)
// 'cntrl', // (JS: cntrl)
// 'lower', // (JS: Lower)
// 'punct', // (JS: punct)
// 'space', // (JS: space)
// 'upper', // (JS: Upper)
]);

function range(start, end) {
Expand Down

0 comments on commit db1d8bb

Please sign in to comment.