diff --git a/README.md b/README.md index 144a632..d634935 100644 --- a/README.md +++ b/README.md @@ -326,7 +326,8 @@ Notice that nearly every feature below has at least subtle differences from Java ✅ ✔ Allows 1 hex digit
- ✔ Error for 2 hex digits > 7F
+ ✔ Above 7F, is a byte of a UTF-8 encoded sequence (unlike JS)
+ ✔ Error for invalid encoded bytes
diff --git a/scripts/onig-compare.js b/scripts/onig-compare.js index 9d8a446..5943759 100644 --- a/scripts/onig-compare.js +++ b/scripts/onig-compare.js @@ -72,6 +72,8 @@ compare([ [r`\x{13FFFF}`, ``, r`Beyond Unicode range: JS doesn't support`], [r`\x{140000}`, ``], [r`\x{0 1}`, `\u{0}\u{1}`], + [r`\💖`, '💖'], + [`\\\u{10000}`, '\u{10000}'], ]); async function compare(tests) { diff --git a/spec/match-char-class.spec.js b/spec/match-char-class.spec.js index 554b2b4..6a8e709 100644 --- a/spec/match-char-class.spec.js +++ b/spec/match-char-class.spec.js @@ -6,6 +6,7 @@ beforeEach(() => { }); describe('CharacterClass', () => { + // TODO: Move to `match-char.spec.js`? describe('Character', () => { describe('escape', () => { it('should match supported letter escapes', () => { diff --git a/spec/match-char.spec.js b/spec/match-char.spec.js index 86faf5b..7c0eabd 100644 --- a/spec/match-char.spec.js +++ b/spec/match-char.spec.js @@ -75,7 +75,7 @@ describe('Character', () => { }); describe('identity escape', () => { - it('should match identity escapes', () => { + it('should match BMP identity escapes', () => { const baseNonmetachars = [ '\0', '!', '~', ' ', '\n', 'E', 'm', '£', '\uFFFF', ]; @@ -84,12 +84,12 @@ describe('Character', () => { } }); - it('should throw for multibyte escapes', () => { - const multibyte = [ + it('should match astral identity escapes', () => { + const astral = [ '💖', '\u{10000}', '\u{10FFFF}', ]; - for (const char of multibyte) { - expect(() => compile(`\\${char}`)).toThrow(); + for (const char of astral) { + expect(char).toExactlyMatch(`\\${char}`); } }); }); @@ -160,23 +160,32 @@ describe('Character', () => { expect('\u{A}').toExactlyMatch(r`\xa`); }); - it(r`should match hex char code with \xNN`, () => { + it(r`should match hex char code with \xNN up to 7F`, () => { expect('\u{1}').toExactlyMatch(r`\x01`); expect('\u{1}1').toExactlyMatch(r`\x011`); expect('\u{A}').toExactlyMatch(r`\x0A`); expect('\u{A}').toExactlyMatch(r`\x0a`); + 
expect('\u{7F}').toExactlyMatch(r`\x7F`); }); - it(r`should throw for incomplete \x`, () => { - expect(() => compile(r`\x`)).toThrow(); - expect(() => compile(r`\x.`)).toThrow(); - expect(() => compile(r`[\x]`)).toThrow(); + it(r`should match hex char code UTF-8 encoded byte sequences \xNN (above 7F)`, () => { + expect('\u{20AC}').toExactlyMatch(r`\xE2\x82\xAC`); // € + expect('\u{20AC}\u{20AC}').toExactlyMatch(r`\xE2\x82\xAC\xE2\x82\xAC`); // €€ + expect('\u{20AC}\u{7F}\u{20AC}').toExactlyMatch(r`\xE2\x82\xAC\x7F\xE2\x82\xAC`); // €€ + expect('\u{9A69}').toExactlyMatch(r`\xE9\xA9\xA9`); // 驩 + expect('\u{FEFF}').toExactlyMatch(r`\xEF\xBB\xBF`); // ZWNBSP/BOM }); - it(r`should throw for multibyte \xNN (above 7F)`, () => { - expect(() => compile(r`\x7F`)).not.toThrow(); + it(r`should throw for invalid UTF-8 encoded byte sequences \xNN (above 7F)`, () => { expect(() => compile(r`\x80`)).toThrow(); expect(() => compile(r`\xFF`)).toThrow(); + expect(() => compile(r`\xEF\xC0\xBB`)).toThrow(); + }); + + it(r`should throw for incomplete \x`, () => { + expect(() => compile(r`\x`)).toThrow(); + expect(() => compile(r`\x.`)).toThrow(); + expect(() => compile(r`[\x]`)).toThrow(); }); it(r`should match hex char code with \uNNNN`, () => { diff --git a/src/generate.js b/src/generate.js index e5897a6..5d7f9dd 100644 --- a/src/generate.js +++ b/src/generate.js @@ -193,6 +193,7 @@ const CharCodeEscapeMap = new Map([ [13, r`\r`], // carriage return [0x2028, r`\u2028`], // line separator [0x2029, r`\u2029`], // paragraph separator + [0xFEFF, r`\uFEFF`], // ZWNBSP/BOM ]); const casedRe = /^\p{Cased}$/u; diff --git a/src/tokenize.js b/src/tokenize.js index e46e6d8..eb6dcd7 100644 --- a/src/tokenize.js +++ b/src/tokenize.js @@ -61,6 +61,7 @@ const EscapeCharCodes = new Map([ const controlCharPattern = 'c.? | C(?:-.?)?'; // Onig considers `\p` an identity escape, but e.g. 
`\p{`, `\p{ ^L}`, and `\p{gc=L}` are invalid const unicodePropertyPattern = r`[pP]\{(?:\^?[\x20\w]+\})?`; +const encodedByteValuePattern = r`x[89A-Fa-f]\p{AHex}(?:\\x[89A-Fa-f]\p{AHex})*`; const hexCharPattern = r`u(?:\p{AHex}{4})? | x\{[^\}]*\}? | x\p{AHex}{0,2}`; const escapedNumPattern = r`\d{1,3}`; const charClassOpenPattern = r`\[\^?\]?`; @@ -71,6 +72,7 @@ const tokenRe = new RegExp(r` \\ (?: ${controlCharPattern} | ${unicodePropertyPattern} + | ${encodedByteValuePattern} | ${hexCharPattern} | ${escapedNumPattern} | [gk]<[^>]*> @@ -93,6 +95,7 @@ const charClassTokenRe = new RegExp(r` \\ (?: ${controlCharPattern} | ${unicodePropertyPattern} + | ${encodedByteValuePattern} | ${hexCharPattern} | ${escapedNumPattern} | . @@ -252,9 +255,8 @@ function getTokenWithDetails(context, pattern, m, lastIndex) { }; } // Run last since it assumes an identity escape as final condition - return { - token: createTokenForSharedEscape(m, {inCharClass: false}), - }; + const result = createTokenForSharedEscape(m, {inCharClass: false}); + return Array.isArray(result) ? 
{tokens: result} : {token: result}; } if (m0 === '(') { // Comment group @@ -405,7 +407,12 @@ function getAllTokensForCharClass(pattern, opener, lastIndex) { break; } } else { - tokens.push(createTokenForAnyTokenWithinCharClass(m)); + const result = createTokenForAnyTokenWithinCharClass(m); + if (Array.isArray(result)) { + tokens.push(...result); + } else { + tokens.push(result); + } } } return { @@ -459,6 +466,27 @@ function createTokenForSharedEscape(raw, {inCharClass}) { } return createTokenForUnicodeProperty(raw); } + // UTF-8 encoded byte sequence + if (/^\\x[89A-Fa-f]\p{AHex}/u.test(raw)) { + try { + const bytes = raw.split(/\\x/).slice(1).map(hex => parseInt(hex, 16)); + const decoded = new TextDecoder('utf-8', { + ignoreBOM: true, + fatal: true, + }).decode(new Uint8Array(bytes)); + const encoder = new TextEncoder(); + const tokens = [...decoded].map(char => { + // Might have different casing for hex A-F than the input + const raw = [...encoder.encode(char)].map(byte => `\\x${byte.toString(16)}`).join(''); + return createToken(TokenTypes.Character, raw, { + value: char.codePointAt(0), + }); + }); + return tokens; + } catch (err) { + throw new Error(`Too short or invalid multibyte code "${raw}"`); + } + } if (char1 === 'u' || char1 === 'x') { return createToken(TokenTypes.Character, raw, { value: getValidatedHexCharCode(raw), @@ -484,13 +512,13 @@ function createTokenForSharedEscape(raw, {inCharClass}) { // [TODO] Supportable; see , throw new Error(`Unsupported meta "${raw}"`); } - // Identity escape; count code unit length - if (raw.length === 2) { + // Identity escape; count code point length + if ([...raw].length === 2) { return createToken(TokenTypes.Character, raw, { value: raw.codePointAt(1), }); } - throw new Error(`Invalid multibyte escape "${raw}"`); + throw new Error(`Unexpected escape "${raw}"`); } /**