diff --git a/README.md b/README.md
index 144a632..d634935 100644
--- a/README.md
+++ b/README.md
@@ -326,7 +326,8 @@ Notice that nearly every feature below has at least subtle differences from Java
✅ |
✔ Allows 1 hex digit
- ✔ Error for 2 hex digits > 7F
+ ✔ Above 7F, is UTF-8 encoded byte (unlike JS)
+ ✔ Error for invalid encoded bytes
|
diff --git a/scripts/onig-compare.js b/scripts/onig-compare.js
index 9d8a446..5943759 100644
--- a/scripts/onig-compare.js
+++ b/scripts/onig-compare.js
@@ -72,6 +72,8 @@ compare([
[r`\x{13FFFF}`, ``, r`Beyond Unicode range: JS doesn't support`],
[r`\x{140000}`, ``],
[r`\x{0 1}`, `\u{0}\u{1}`],
+ [r`\💖`, '💖'],
+ [`\\\u{10000}`, '\u{10000}'],
]);
async function compare(tests) {
diff --git a/spec/match-char-class.spec.js b/spec/match-char-class.spec.js
index 554b2b4..6a8e709 100644
--- a/spec/match-char-class.spec.js
+++ b/spec/match-char-class.spec.js
@@ -6,6 +6,7 @@ beforeEach(() => {
});
describe('CharacterClass', () => {
+ // TODO: Move to `match-char.spec.js`?
describe('Character', () => {
describe('escape', () => {
it('should match supported letter escapes', () => {
diff --git a/spec/match-char.spec.js b/spec/match-char.spec.js
index 86faf5b..7c0eabd 100644
--- a/spec/match-char.spec.js
+++ b/spec/match-char.spec.js
@@ -75,7 +75,7 @@ describe('Character', () => {
});
describe('identity escape', () => {
- it('should match identity escapes', () => {
+ it('should match BMP identity escapes', () => {
const baseNonmetachars = [
'\0', '!', '~', ' ', '\n', 'E', 'm', '£', '\uFFFF',
];
@@ -84,12 +84,12 @@ describe('Character', () => {
}
});
- it('should throw for multibyte escapes', () => {
- const multibyte = [
+ it('should match astral identity escapes', () => {
+ const astral = [
'💖', '\u{10000}', '\u{10FFFF}',
];
- for (const char of multibyte) {
- expect(() => compile(`\\${char}`)).toThrow();
+ for (const char of astral) {
+ expect(char).toExactlyMatch(`\\${char}`);
}
});
});
@@ -160,23 +160,32 @@ describe('Character', () => {
expect('\u{A}').toExactlyMatch(r`\xa`);
});
- it(r`should match hex char code with \xNN`, () => {
+ it(r`should match hex char code with \xNN up to 7F`, () => {
expect('\u{1}').toExactlyMatch(r`\x01`);
expect('\u{1}1').toExactlyMatch(r`\x011`);
expect('\u{A}').toExactlyMatch(r`\x0A`);
expect('\u{A}').toExactlyMatch(r`\x0a`);
+ expect('\u{7F}').toExactlyMatch(r`\x7F`);
});
- it(r`should throw for incomplete \x`, () => {
- expect(() => compile(r`\x`)).toThrow();
- expect(() => compile(r`\x.`)).toThrow();
- expect(() => compile(r`[\x]`)).toThrow();
+ it(r`should match hex char code UTF-8 encoded byte sequences \xNN (above 7F)`, () => {
+ expect('\u{20AC}').toExactlyMatch(r`\xE2\x82\xAC`); // €
+ expect('\u{20AC}\u{20AC}').toExactlyMatch(r`\xE2\x82\xAC\xE2\x82\xAC`); // €€
+ expect('\u{20AC}\u{7F}\u{20AC}').toExactlyMatch(r`\xE2\x82\xAC\x7F\xE2\x82\xAC`); // €, DEL (U+007F), €
+ expect('\u{9A69}').toExactlyMatch(r`\xE9\xA9\xA9`); // 驩
+ expect('\u{FEFF}').toExactlyMatch(r`\xEF\xBB\xBF`); // ZWNBSP/BOM
});
- it(r`should throw for multibyte \xNN (above 7F)`, () => {
- expect(() => compile(r`\x7F`)).not.toThrow();
+ it(r`should throw for invalid UTF-8 encoded byte sequences \xNN (above 7F)`, () => {
expect(() => compile(r`\x80`)).toThrow();
expect(() => compile(r`\xFF`)).toThrow();
+ expect(() => compile(r`\xEF\xC0\xBB`)).toThrow();
+ });
+
+ it(r`should throw for incomplete \x`, () => {
+ expect(() => compile(r`\x`)).toThrow();
+ expect(() => compile(r`\x.`)).toThrow();
+ expect(() => compile(r`[\x]`)).toThrow();
});
it(r`should match hex char code with \uNNNN`, () => {
diff --git a/src/generate.js b/src/generate.js
index e5897a6..5d7f9dd 100644
--- a/src/generate.js
+++ b/src/generate.js
@@ -193,6 +193,7 @@ const CharCodeEscapeMap = new Map([
[13, r`\r`], // carriage return
[0x2028, r`\u2028`], // line separator
[0x2029, r`\u2029`], // paragraph separator
+ [0xFEFF, r`\uFEFF`], // ZWNBSP/BOM
]);
const casedRe = /^\p{Cased}$/u;
diff --git a/src/tokenize.js b/src/tokenize.js
index e46e6d8..eb6dcd7 100644
--- a/src/tokenize.js
+++ b/src/tokenize.js
@@ -61,6 +61,7 @@ const EscapeCharCodes = new Map([
const controlCharPattern = 'c.? | C(?:-.?)?';
// Onig considers `\p` an identity escape, but e.g. `\p{`, `\p{ ^L}`, and `\p{gc=L}` are invalid
const unicodePropertyPattern = r`[pP]\{(?:\^?[\x20\w]+\})?`;
+const encodedByteValuePattern = r`x[89A-Fa-f]\p{AHex}(?:\\x[89A-Fa-f]\p{AHex})*`;
const hexCharPattern = r`u(?:\p{AHex}{4})? | x\{[^\}]*\}? | x\p{AHex}{0,2}`;
const escapedNumPattern = r`\d{1,3}`;
const charClassOpenPattern = r`\[\^?\]?`;
@@ -71,6 +72,7 @@ const tokenRe = new RegExp(r`
\\ (?:
${controlCharPattern}
| ${unicodePropertyPattern}
+ | ${encodedByteValuePattern}
| ${hexCharPattern}
| ${escapedNumPattern}
| [gk]<[^>]*>
@@ -93,6 +95,7 @@ const charClassTokenRe = new RegExp(r`
\\ (?:
${controlCharPattern}
| ${unicodePropertyPattern}
+ | ${encodedByteValuePattern}
| ${hexCharPattern}
| ${escapedNumPattern}
| .
@@ -252,9 +255,8 @@ function getTokenWithDetails(context, pattern, m, lastIndex) {
};
}
// Run last since it assumes an identity escape as final condition
- return {
- token: createTokenForSharedEscape(m, {inCharClass: false}),
- };
+ const result = createTokenForSharedEscape(m, {inCharClass: false});
+ return Array.isArray(result) ? {tokens: result} : {token: result};
}
if (m0 === '(') {
// Comment group
@@ -405,7 +407,12 @@ function getAllTokensForCharClass(pattern, opener, lastIndex) {
break;
}
} else {
- tokens.push(createTokenForAnyTokenWithinCharClass(m));
+ const result = createTokenForAnyTokenWithinCharClass(m);
+ if (Array.isArray(result)) {
+ tokens.push(...result);
+ } else {
+ tokens.push(result);
+ }
}
}
return {
@@ -459,6 +466,27 @@ function createTokenForSharedEscape(raw, {inCharClass}) {
}
return createTokenForUnicodeProperty(raw);
}
+ // UTF-8 encoded byte sequence
+ if (/^\\x[89A-Fa-f]\p{AHex}/u.test(raw)) {
+ try {
+ const bytes = raw.split(/\\x/).slice(1).map(hex => parseInt(hex, 16));
+ const decoded = new TextDecoder('utf-8', {
+ ignoreBOM: true,
+ fatal: true,
+ }).decode(new Uint8Array(bytes));
+ const encoder = new TextEncoder();
+ const tokens = [...decoded].map(char => {
+ // Might have different casing for hex A-F than the input
+ const raw = [...encoder.encode(char)].map(byte => `\\x${byte.toString(16)}`).join('');
+ return createToken(TokenTypes.Character, raw, {
+ value: char.codePointAt(0),
+ });
+ });
+ return tokens;
+ } catch (err) {
+ throw new Error(`Too short or invalid multibyte code "${raw}"`);
+ }
+ }
if (char1 === 'u' || char1 === 'x') {
return createToken(TokenTypes.Character, raw, {
value: getValidatedHexCharCode(raw),
@@ -484,13 +512,13 @@ function createTokenForSharedEscape(raw, {inCharClass}) {
// [TODO] Supportable; see ,
throw new Error(`Unsupported meta "${raw}"`);
}
- // Identity escape; count code unit length
- if (raw.length === 2) {
+ // Identity escape; count code point length
+ if ([...raw].length === 2) {
return createToken(TokenTypes.Character, raw, {
value: raw.codePointAt(1),
});
}
- throw new Error(`Invalid multibyte escape "${raw}"`);
+ throw new Error(`Unexpected escape "${raw}"`);
}
/**