Add flag D; use Unicode \d by default

slevithan · Nov 21, 2024 · db1d8bb · db1d8bb
1 parent e0bae15
commit db1d8bb
Show file tree

Hide file tree

Showing 7 changed files with 55 additions and 18 deletions.
diff --git a/README.md b/README.md
@@ -265,7 +265,7 @@ Notice that nearly every feature below has at least subtle differences from Java
   </tr>
 
   <tr valign="top">
-    <th align="left" rowspan="6">Flags</th>
+    <th align="left" rowspan="7">Flags</th>
     <td colspan="5"><i>Supported in top-level flags and pattern modifiers</i></td>
   </tr>
   <tr valign="top">
@@ -303,13 +303,22 @@ Notice that nearly every feature below has at least subtle differences from Java
   <tr valign="top">
     <td colspan="5"><i>Currently supported only in top-level flags</i></td>
   </tr>
+  <tr valign="top">
+    <td>Digit is ASCII</td>
+    <td><code>D</code></td>
+    <td align="middle">✅</td>
+    <td align="middle">✅</td>
+    <td>
+      ✔ ASCII <code>\d</code>, <code>\p{Digit}</code>, <code>[[:digit:]]</code><br>
+    </td>
+  </tr>
   <tr valign="top">
     <td>Word is ASCII</td>
     <td><code>W</code></td>
     <td align="middle">✅</td>
     <td align="middle">✅</td>
     <td>
-      ✔ ASCII <code>\b</code>, <code>\w</code>, <code>[[:word:]]</code>, <code>\p{Word}</code><br>
+      ✔ ASCII <code>\b</code>, <code>\w</code>, <code>\p{Word}</code>, <code>[[:word:]]</code><br>
     </td>
   </tr>
 
@@ -438,13 +447,13 @@ Notice that nearly every feature below has at least subtle differences from Java
   </tr>
 
   <tr valign="top">
-    <th align="left" rowspan="7">Character sets</th>
-    <td>Digit, word</td>
-    <td><code>\d</code>, <code>\w</code>, etc.</td>
+    <th align="left" rowspan="8">Character sets</th>
+    <td>Digit</td>
+    <td><code>\d</code>, <code>\D</code></td>
     <td align="middle">✅</td>
     <td align="middle">✅</td>
     <td>
-      ✔ Same as JS (ASCII)<br>
+      ✔ Unicode by default (≠ JS)<br>
     </td>
   </tr>
   <tr valign="top">
@@ -465,6 +474,15 @@ Notice that nearly every feature below has at least subtle differences from Java
       ✔ ASCII (≠ JS)<br>
     </td>
   </tr>
+  <tr valign="top">
+    <td>Word</td>
+    <td><code>\w</code>, <code>\W</code></td>
+    <td align="middle">✅</td>
+    <td align="middle">✅</td>
+    <td>
+      ✔ Unicode by default (≠ JS)<br>
+    </td>
+  </tr>
   <tr valign="top">
     <td>Dot</td>
     <td><code>.</code></td>

diff --git a/demo/demo.js b/demo/demo.js
@@ -10,6 +10,7 @@ const state = {
     i: getValue('flag-i'),
     m: getValue('flag-m'),
     x: getValue('flag-x'),
+    D: getValue('flag-D'),
     W: getValue('flag-W'),
   },
   opts: {
@@ -73,6 +74,8 @@ function showTranspiled() {
       state.flags.m ? 'm' : ''
     }${
       state.flags.x ? 'x' : ''
+    }${
+      state.flags.D ? 'D' : ''
     }${
       state.flags.W ? 'W' : ''
     }`,

diff --git a/demo/index.html b/demo/index.html
@@ -35,6 +35,11 @@ <h2>Try it</h2>
         <kbd>x</kbd>
         <span class="tip tip-lg">Insignificant whitespace and comments</span>
       </label>
+      <label>
+        <input type="checkbox" id="flag-D" onchange="setFlag('D', this.checked)">
+        <kbd>D</kbd>
+        <span class="tip tip-sm">Digit is ASCII</span>
+      </label>
       <label>
         <input type="checkbox" id="flag-W" onchange="setFlag('W', this.checked)">
         <kbd>W</kbd>

diff --git a/src/parse.js b/src/parse.js
@@ -542,12 +542,13 @@ function createDirectiveFromToken({kind, flags}) {
   return node;
 }
 
-function createFlags({ignoreCase, dotAll, extended, wordIsAscii}) {
+function createFlags({ignoreCase, dotAll, extended, digitIsAscii, wordIsAscii}) {
   return {
     type: AstTypes.Flags,
     ignoreCase,
     dotAll,
     extended,
+    digitIsAscii,
     wordIsAscii,
   };
 }

diff --git a/src/tokenize.js b/src/tokenize.js
@@ -136,8 +136,8 @@ function tokenize(pattern, flags = '') {
   if (typeof pattern !== 'string') {
     throw new Error('String expected as pattern');
   }
-  if (!/^[imxW]*$/.test(flags)) {
-    throw new Error(`Flags "${flags}" unsupported`);
+  if (!/^[imxDW]*$/.test(flags)) {
+    throw new Error(`Flags "${flags}" includes unsupported value`);
   }
   const xStack = [flags.includes('x')];
   const context = {
@@ -196,6 +196,8 @@ function tokenize(pattern, flags = '') {
       dotAll: flags.includes('m'),
       // Flag x is fully handled during tokenization
       extended: flags.includes('x'),
+      // Flag D is currently only supported as a top-level flag
+      digitIsAscii: flags.includes('D'),
       // Flag W is currently only supported as a top-level flag
       wordIsAscii: flags.includes('W'),
     },

diff --git a/src/transform.js b/src/transform.js
@@ -57,6 +57,7 @@ function transform(ast, options) {
     // Subroutines can appear before the groups they ref, so collect reffed nodes for a second pass 
     subroutineRefMap: new Map(),
     supportedGNodes: new Set(),
+    digitIsAscii: ast.flags.digitIsAscii,
     wordIsAscii: ast.flags.wordIsAscii,
   };
   traverse({node: ast}, firstPassState, FirstPassVisitor);
@@ -155,10 +156,12 @@ const FirstPassVisitor = {
     subroutineRefMap.set(name ?? number, node);
   },
 
-  CharacterSet({node, replaceWith}, {accuracy, minTargetEs2024, wordIsAscii}) {
+  CharacterSet({node, replaceWith}, {accuracy, minTargetEs2024, digitIsAscii, wordIsAscii}) {
     const {kind, negate, value} = node;
     if (kind === AstCharacterSetKinds.any) {
       replaceWith(createUnicodeProperty('Any'));
+    } else if (kind === AstCharacterSetKinds.digit && !digitIsAscii) {
+      replaceWith(createUnicodeProperty('Nd', {negate}));
     } else if (kind === AstCharacterSetKinds.hex) {
       replaceWith(createUnicodeProperty('AHex', {negate}));
     } else if (kind === AstCharacterSetKinds.non_newline) {
@@ -178,6 +181,8 @@ const FirstPassVisitor = {
           ascii = `\0-${cp(ascii.codePointAt(0) - 1)}${cp(ascii.codePointAt(2) + 1)}-\u{10FFFF}`;
         }
         replaceWith(parseFragment(`[${ascii}]`));
+      } else if (value === 'digit' && digitIsAscii) {
+        replaceWith(createCharacterSet(AstCharacterSetKinds.digit, {negate}));
       } else if (value === 'word' && wordIsAscii) {
         replaceWith(createCharacterSet(AstCharacterSetKinds.word, {negate}));
       } else {
@@ -228,6 +233,7 @@ const FirstPassVisitor = {
   Flags({node, parent}) {
     // Remove Onig flags that aren't available in JS
     delete node.extended; // Flag x
+    delete node.digitIsAscii; // Flag D
     delete node.wordIsAscii; // Flag W
     Object.assign(node, {
       // JS flag g; no Onig equiv

diff --git a/src/unicode.js b/src/unicode.js
@@ -5,6 +5,7 @@ const CharsWithoutIgnoreCaseExpansion = new Set([
   cp(0x131), // ı
 ]);
 
+// Different from `PosixClassesMap`'s `word`
 const defaultWordChar = r`[\p{L}\p{M}\p{N}\p{Pc}]`;
 
 function getIgnoreCaseMatchChars(char) {
@@ -241,14 +242,15 @@ const PosixProperties = new Set([
   'word',
   'xdigit',
   // The following are available with the same name in JS (see `JsUnicodeProperties`)
-  // - alpha (JS: Alpha)
-  // - ascii (JS: ASCII)
-  // - cntrl (JS: cntrl)
-  // - digit (JS: digit)
-  // - lower (JS: Lower)
-  // - punct (JS: punct)
-  // - space (JS: space)
-  // - upper (JS: Upper)
+  // Explicitly include `digit` so it's handled as POSIX, as needed for flag D (`digitIsAscii`)
+  'digit', // (JS: digit)
+  // 'alpha', // (JS: Alpha)
+  // 'ascii', // (JS: ASCII)
+  // 'cntrl', // (JS: cntrl)
+  // 'lower', // (JS: Lower)
+  // 'punct', // (JS: punct)
+  // 'space', // (JS: space)
+  // 'upper', // (JS: Upper)
 ]);
 
 function range(start, end) {