Multibyte escapes

slevithan · Nov 3, 2024 · b6da11e · b6da11e
1 parent 696dcab
commit b6da11e
Show file tree

Hide file tree

Showing 6 changed files with 62 additions and 20 deletions.
diff --git a/README.md b/README.md
@@ -326,7 +326,8 @@ Notice that nearly every feature below has at least subtle differences from Java
     <td align="middle">✅</td>
     <td>
       ✔ Allows 1 hex digit<br>
-      ✔ Error for 2 hex digits > <code>7F</code><br>
+      ✔ Above <code>7F</code>, is a UTF-8 encoded byte (unlike JS)<br>
+      ✔ Error for invalid encoded bytes<br>
     </td>
   </tr>
   <tr valign="top">

diff --git a/scripts/onig-compare.js b/scripts/onig-compare.js
@@ -72,6 +72,8 @@ compare([
   [r`\x{13FFFF}`, ``, r`Beyond Unicode range: JS doesn't support`],
   [r`\x{140000}`, ``],
   [r`\x{0 1}`, `\u{0}\u{1}`],
+  [r`\💖`, '💖'],
+  [`\\\u{10000}`, '\u{10000}'],
 ]);
 
 async function compare(tests) {

diff --git a/spec/match-char-class.spec.js b/spec/match-char-class.spec.js
@@ -6,6 +6,7 @@ beforeEach(() => {
 });
 
 describe('CharacterClass', () => {
+  // TODO: Move to `match-char.spec.js`?
   describe('Character', () => {
     describe('escape', () => {
       it('should match supported letter escapes', () => {

diff --git a/spec/match-char.spec.js b/spec/match-char.spec.js
@@ -75,7 +75,7 @@ describe('Character', () => {
   });
 
   describe('identity escape', () => {
-    it('should match identity escapes', () => {
+    it('should match BMP identity escapes', () => {
       const baseNonmetachars = [
         '\0', '!', '~', ' ', '\n', 'E', 'm', '£', '\uFFFF',
       ];
@@ -84,12 +84,12 @@ describe('Character', () => {
       }
     });
 
-    it('should throw for multibyte escapes', () => {
-      const multibyte = [
+    it('should match astral identity escapes', () => {
+      const astral = [
         '💖', '\u{10000}', '\u{10FFFF}',
       ];
-      for (const char of multibyte) {
-        expect(() => compile(`\\${char}`)).toThrow();
+      for (const char of astral) {
+        expect(char).toExactlyMatch(`\\${char}`);
       }
     });
   });
@@ -160,23 +160,32 @@ describe('Character', () => {
       expect('\u{A}').toExactlyMatch(r`\xa`);
     });
 
-    it(r`should match hex char code with \xNN`, () => {
+    it(r`should match hex char code with \xNN up to 7F`, () => {
       expect('\u{1}').toExactlyMatch(r`\x01`);
       expect('\u{1}1').toExactlyMatch(r`\x011`);
       expect('\u{A}').toExactlyMatch(r`\x0A`);
       expect('\u{A}').toExactlyMatch(r`\x0a`);
+      expect('\u{7F}').toExactlyMatch(r`\x7F`);
     });
 
-    it(r`should throw for incomplete \x`, () => {
-      expect(() => compile(r`\x`)).toThrow();
-      expect(() => compile(r`\x.`)).toThrow();
-      expect(() => compile(r`[\x]`)).toThrow();
+    it(r`should match hex char code UTF-8 encoded byte sequences \xNN (above 7F)`, () => {
+      expect('\u{20AC}').toExactlyMatch(r`\xE2\x82\xAC`); // €
+      expect('\u{20AC}\u{20AC}').toExactlyMatch(r`\xE2\x82\xAC\xE2\x82\xAC`); // €€
+      expect('\u{20AC}\u{7F}\u{20AC}').toExactlyMatch(r`\xE2\x82\xAC\x7F\xE2\x82\xAC`); // €, DEL (U+007F), €
+      expect('\u{9A69}').toExactlyMatch(r`\xE9\xA9\xA9`); // 驩
+      expect('\u{FEFF}').toExactlyMatch(r`\xEF\xBB\xBF`); // ZWNBSP/BOM
     });
 
-    it(r`should throw for multibyte \xNN (above 7F)`, () => {
-      expect(() => compile(r`\x7F`)).not.toThrow();
+    it(r`should throw for invalid UTF-8 encoded byte sequences \xNN (above 7F)`, () => {
       expect(() => compile(r`\x80`)).toThrow();
       expect(() => compile(r`\xFF`)).toThrow();
+      expect(() => compile(r`\xEF\xC0\xBB`)).toThrow();
+    });
+
+    it(r`should throw for incomplete \x`, () => {
+      expect(() => compile(r`\x`)).toThrow();
+      expect(() => compile(r`\x.`)).toThrow();
+      expect(() => compile(r`[\x]`)).toThrow();
     });
 
     it(r`should match hex char code with \uNNNN`, () => {

diff --git a/src/generate.js b/src/generate.js
@@ -193,6 +193,7 @@ const CharCodeEscapeMap = new Map([
   [13, r`\r`], // carriage return
   [0x2028, r`\u2028`], // line separator
   [0x2029, r`\u2029`], // paragraph separator
+  [0xFEFF, r`\uFEFF`], // ZWNBSP/BOM
 ]);
 
 const casedRe = /^\p{Cased}$/u;

diff --git a/src/tokenize.js b/src/tokenize.js
@@ -61,6 +61,7 @@ const EscapeCharCodes = new Map([
 const controlCharPattern = 'c.? | C(?:-.?)?';
 // Onig considers `\p` an identity escape, but e.g. `\p{`, `\p{ ^L}`, and `\p{gc=L}` are invalid
 const unicodePropertyPattern = r`[pP]\{(?:\^?[\x20\w]+\})?`;
+const encodedByteValuePattern = r`x[89A-Fa-f]\p{AHex}(?:\\x[89A-Fa-f]\p{AHex})*`;
 const hexCharPattern = r`u(?:\p{AHex}{4})? | x\{[^\}]*\}? | x\p{AHex}{0,2}`;
 const escapedNumPattern = r`\d{1,3}`;
 const charClassOpenPattern = r`\[\^?\]?`;
@@ -71,6 +72,7 @@ const tokenRe = new RegExp(r`
   \\ (?:
     ${controlCharPattern}
     | ${unicodePropertyPattern}
+    | ${encodedByteValuePattern}
     | ${hexCharPattern}
     | ${escapedNumPattern}
     | [gk]<[^>]*>
@@ -93,6 +95,7 @@ const charClassTokenRe = new RegExp(r`
   \\ (?:
     ${controlCharPattern}
     | ${unicodePropertyPattern}
+    | ${encodedByteValuePattern}
     | ${hexCharPattern}
     | ${escapedNumPattern}
     | .
@@ -252,9 +255,8 @@ function getTokenWithDetails(context, pattern, m, lastIndex) {
       };
     }
     // Run last since it assumes an identity escape as final condition
-    return {
-      token: createTokenForSharedEscape(m, {inCharClass: false}),
-    };
+    const result = createTokenForSharedEscape(m, {inCharClass: false});
+    return Array.isArray(result) ? {tokens: result} : {token: result};
   }
   if (m0 === '(') {
     // Comment group
@@ -405,7 +407,12 @@ function getAllTokensForCharClass(pattern, opener, lastIndex) {
         break;
       }
     } else {
-      tokens.push(createTokenForAnyTokenWithinCharClass(m));
+      const result = createTokenForAnyTokenWithinCharClass(m);
+      if (Array.isArray(result)) {
+        tokens.push(...result);
+      } else {
+        tokens.push(result);
+      }
     }
   }
   return {
@@ -459,6 +466,27 @@ function createTokenForSharedEscape(raw, {inCharClass}) {
     }
     return createTokenForUnicodeProperty(raw);
   }
+  // UTF-8 encoded byte sequence
+  if (/^\\x[89A-Fa-f]\p{AHex}/u.test(raw)) {
+    try {
+      const bytes = raw.split(/\\x/).slice(1).map(hex => parseInt(hex, 16));
+      const decoded = new TextDecoder('utf-8', {
+        ignoreBOM: true,
+        fatal: true,
+      }).decode(new Uint8Array(bytes));
+      const encoder = new TextEncoder();
+      const tokens = [...decoded].map(char => {
+        // Might have different casing for hex A-F than the input
+        const raw = [...encoder.encode(char)].map(byte => `\\x${byte.toString(16)}`).join('');
+        return createToken(TokenTypes.Character, raw, {
+          value: char.codePointAt(0),
+        });
+      });
+      return tokens;
+    } catch (err) {
+      throw new Error(`Too short or invalid multibyte code "${raw}"`);
+    }
+  }
   if (char1 === 'u' || char1 === 'x') {
     return createToken(TokenTypes.Character, raw, {
       value: getValidatedHexCharCode(raw),
@@ -484,13 +512,13 @@ function createTokenForSharedEscape(raw, {inCharClass}) {
     // [TODO] Supportable; see <https://github.com/kkos/oniguruma/blob/master/doc/SYNTAX.md#12-onig_syn_op2_esc_capital_m_bar_meta-enable-m-x>, <https://github.com/kkos/oniguruma/blob/43a8c3f3daf263091f3a74019d4b32ebb6417093/src/regparse.c#L4695>
     throw new Error(`Unsupported meta "${raw}"`);
   }
-  // Identity escape; count code unit length
-  if (raw.length === 2) {
+  // Identity escape; count code point length
+  if ([...raw].length === 2) {
     return createToken(TokenTypes.Character, raw, {
       value: raw.codePointAt(1),
     });
   }
-  throw new Error(`Invalid multibyte escape "${raw}"`);
+  throw new Error(`Unexpected escape "${raw}"`);
 }
 
 /**