diff --git a/README.md b/README.md
index 1985e52..47fe626 100644
--- a/README.md
+++ b/README.md
@@ -265,7 +265,7 @@ Notice that nearly every feature below has at least subtle differences from Java
- Flags
+ Flags
Supported in top-level flags and pattern modifiers
@@ -303,13 +303,22 @@ Notice that nearly every feature below has at least subtle differences from Java
Currently supported only in top-level flags
+
+ Digit is ASCII
+ D
+ ✅
+ ✅
+
+ ✔ ASCII \d
, \p{Digit}
, [[:digit:]]
+
+
Word is ASCII
W
✅
✅
- ✔ ASCII \b
, \w
, [[:word:]]
, \p{Word}
+ ✔ ASCII \b
, \w
, \p{Word}
, [[:word:]]
@@ -438,13 +447,13 @@ Notice that nearly every feature below has at least subtle differences from Java
- Character sets
- Digit, word
- \d
, \w
, etc.
+ Character sets
+ Digit
+ \d
, \D
✅
✅
- ✔ Same as JS (ASCII)
+ ✔ Unicode by default (≠ JS)
@@ -465,6 +474,15 @@ Notice that nearly every feature below has at least subtle differences from Java
✔ ASCII (≠ JS)
+
+ Word
+ \w
, \W
+ ✅
+ ✅
+
+ ✔ Unicode by default (≠ JS)
+
+
Dot
.
diff --git a/demo/demo.js b/demo/demo.js
index b50bdbd..1880522 100644
--- a/demo/demo.js
+++ b/demo/demo.js
@@ -10,6 +10,7 @@ const state = {
i: getValue('flag-i'),
m: getValue('flag-m'),
x: getValue('flag-x'),
+ D: getValue('flag-D'),
W: getValue('flag-W'),
},
opts: {
@@ -73,6 +74,8 @@ function showTranspiled() {
state.flags.m ? 'm' : ''
}${
state.flags.x ? 'x' : ''
+ }${
+ state.flags.D ? 'D' : ''
}${
state.flags.W ? 'W' : ''
}`,
diff --git a/demo/index.html b/demo/index.html
index 32a05ae..3fb2a36 100644
--- a/demo/index.html
+++ b/demo/index.html
@@ -35,6 +35,11 @@ Try it
x
Insignificant whitespace and comments
+
+
+ D
+ Digit is ASCII
+
W
diff --git a/src/parse.js b/src/parse.js
index 8d633be..f7d58af 100644
--- a/src/parse.js
+++ b/src/parse.js
@@ -542,12 +542,13 @@ function createDirectiveFromToken({kind, flags}) {
return node;
}
-function createFlags({ignoreCase, dotAll, extended, wordIsAscii}) {
+function createFlags({ignoreCase, dotAll, extended, digitIsAscii, wordIsAscii}) {
return {
type: AstTypes.Flags,
ignoreCase,
dotAll,
extended,
+ digitIsAscii,
wordIsAscii,
};
}
diff --git a/src/tokenize.js b/src/tokenize.js
index ebac3c3..1b1660d 100644
--- a/src/tokenize.js
+++ b/src/tokenize.js
@@ -136,8 +136,8 @@ function tokenize(pattern, flags = '') {
if (typeof pattern !== 'string') {
throw new Error('String expected as pattern');
}
- if (!/^[imxW]*$/.test(flags)) {
- throw new Error(`Flags "${flags}" unsupported`);
+ if (!/^[imxDW]*$/.test(flags)) {
+ throw new Error(`Flags "${flags}" includes unsupported value`);
}
const xStack = [flags.includes('x')];
const context = {
@@ -196,6 +196,8 @@ function tokenize(pattern, flags = '') {
dotAll: flags.includes('m'),
// Flag x is fully handled during tokenization
extended: flags.includes('x'),
+ // Flag D is currently only supported as a top-level flag
+ digitIsAscii: flags.includes('D'),
// Flag W is currently only supported as a top-level flag
wordIsAscii: flags.includes('W'),
},
diff --git a/src/transform.js b/src/transform.js
index 9473f14..6733147 100644
--- a/src/transform.js
+++ b/src/transform.js
@@ -57,6 +57,7 @@ function transform(ast, options) {
// Subroutines can appear before the groups they ref, so collect reffed nodes for a second pass
subroutineRefMap: new Map(),
supportedGNodes: new Set(),
+ digitIsAscii: ast.flags.digitIsAscii,
wordIsAscii: ast.flags.wordIsAscii,
};
traverse({node: ast}, firstPassState, FirstPassVisitor);
@@ -155,10 +156,12 @@ const FirstPassVisitor = {
subroutineRefMap.set(name ?? number, node);
},
- CharacterSet({node, replaceWith}, {accuracy, minTargetEs2024, wordIsAscii}) {
+ CharacterSet({node, replaceWith}, {accuracy, minTargetEs2024, digitIsAscii, wordIsAscii}) {
const {kind, negate, value} = node;
if (kind === AstCharacterSetKinds.any) {
replaceWith(createUnicodeProperty('Any'));
+ } else if (kind === AstCharacterSetKinds.digit && !digitIsAscii) {
+ replaceWith(createUnicodeProperty('Nd', {negate}));
} else if (kind === AstCharacterSetKinds.hex) {
replaceWith(createUnicodeProperty('AHex', {negate}));
} else if (kind === AstCharacterSetKinds.non_newline) {
@@ -178,6 +181,8 @@ const FirstPassVisitor = {
ascii = `\0-${cp(ascii.codePointAt(0) - 1)}${cp(ascii.codePointAt(2) + 1)}-\u{10FFFF}`;
}
replaceWith(parseFragment(`[${ascii}]`));
+ } else if (value === 'digit' && digitIsAscii) {
+ replaceWith(createCharacterSet(AstCharacterSetKinds.digit, {negate}));
} else if (value === 'word' && wordIsAscii) {
replaceWith(createCharacterSet(AstCharacterSetKinds.word, {negate}));
} else {
@@ -228,6 +233,7 @@ const FirstPassVisitor = {
Flags({node, parent}) {
// Remove Onig flags that aren't available in JS
delete node.extended; // Flag x
+ delete node.digitIsAscii; // Flag D
delete node.wordIsAscii; // Flag W
Object.assign(node, {
// JS flag g; no Onig equiv
diff --git a/src/unicode.js b/src/unicode.js
index 2c44fda..f9f95cf 100644
--- a/src/unicode.js
+++ b/src/unicode.js
@@ -5,6 +5,7 @@ const CharsWithoutIgnoreCaseExpansion = new Set([
cp(0x131), // ı
]);
+// Different from `PosixClassesMap`'s `word`
const defaultWordChar = r`[\p{L}\p{M}\p{N}\p{Pc}]`;
function getIgnoreCaseMatchChars(char) {
@@ -241,14 +242,15 @@ const PosixProperties = new Set([
'word',
'xdigit',
// The following are available with the same name in JS (see `JsUnicodeProperties`)
- // - alpha (JS: Alpha)
- // - ascii (JS: ASCII)
- // - cntrl (JS: cntrl)
- // - digit (JS: digit)
- // - lower (JS: Lower)
- // - punct (JS: punct)
- // - space (JS: space)
- // - upper (JS: Upper)
+ // `digit` is listed explicitly, despite sharing its name with a JS property, so that flag D (`digitIsAscii`) can handle it as a POSIX class
+ 'digit', // (JS: digit)
+ // 'alpha', // (JS: Alpha)
+ // 'ascii', // (JS: ASCII)
+ // 'cntrl', // (JS: cntrl)
+ // 'lower', // (JS: Lower)
+ // 'punct', // (JS: punct)
+ // 'space', // (JS: space)
+ // 'upper', // (JS: Upper)
]);
function range(start, end) {