Skip to content

Commit

Permalink
Merge pull request #3200 from kena0ki/issue3032
Browse files Browse the repository at this point in the history
Filter U+FEFF (BOM) when decoding input data
  • Loading branch information
jerch authored Jan 23, 2021
2 parents 4da5d55 + 3d941f2 commit ff74caf
Show file tree
Hide file tree
Showing 2 changed files with 42 additions and 8 deletions.
38 changes: 34 additions & 4 deletions src/common/input/TextDecoder.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,8 @@ describe('text encodings', () => {
const decoder = new StringToUtf32();
const target = new Uint32Array(5);
for (let i = 0; i < 65536; ++i) {
// skip surrogate pairs
if (i >= 0xD800 && i <= 0xDFFF) {
// skip surrogate code units (U+D800-U+DFFF) and the BOM (U+FEFF)
if ((i >= 0xD800 && i <= 0xDFFF) || i === 0xFEFF) {
continue;
}
const length = decoder.decode(String.fromCharCode(i), target);
Expand All @@ -84,6 +84,14 @@ describe('text encodings', () => {
decoder.clear();
}
});

// A lone U+FEFF (BOM) in the input string must be dropped by the decoder:
// decode() is expected to report zero code points written.
it('0xFEFF(BOM)', () => {
  const utf32Decoder = new StringToUtf32();
  const buffer = new Uint32Array(5);
  const bom = String.fromCharCode(0xFEFF);
  const written = utf32Decoder.decode(bom, buffer);
  assert.equal(written, 0);
  utf32Decoder.clear();
});
});

it('test strings', () => {
Expand Down Expand Up @@ -118,8 +126,8 @@ describe('text encodings', () => {
const decoder = new Utf8ToUtf32();
const target = new Uint32Array(5);
for (let i = 0; i < 65536; ++i) {
// skip surrogate pairs
if (i >= 0xD800 && i <= 0xDFFF) {
// skip surrogate code units (U+D800-U+DFFF) and the BOM (U+FEFF)
if ((i >= 0xD800 && i <= 0xDFFF) || i === 0xFEFF) {
continue;
}
const utf8Data = fromByteString(encode(String.fromCharCode(i)));
Expand All @@ -142,6 +150,15 @@ describe('text encodings', () => {
decoder.clear();
}
});

// The UTF-8 encoding of U+FEFF (BOM) must be dropped by the decoder:
// decode() is expected to report zero code points written.
it('0xFEFF(BOM)', () => {
  const utf8Decoder = new Utf8ToUtf32();
  const buffer = new Uint32Array(5);
  const bom = String.fromCharCode(0xFEFF);
  const encodedBom = fromByteString(encode(bom));
  const written = utf8Decoder.decode(encodedBom, buffer);
  assert.equal(written, 0);
  utf8Decoder.clear();
});
});

it('test strings', () => {
Expand Down Expand Up @@ -215,6 +232,19 @@ describe('text encodings', () => {
}
assert(decoded, 'Ä€𝄞Ö𝄞€Ü𝄞€');
});

// Two back-to-back UTF-8 BOMs (EF BB BF) are fed to the decoder in 2-byte
// chunks, so each 3-byte sequence is split across decode() calls. Nothing
// may end up in the decoded output.
it('BOMs (3 byte sequences) - advance by 2', () => {
  const utf8Decoder = new Utf8ToUtf32();
  const buffer = new Uint32Array(5);
  const bytes = fromByteString('\xef\xbb\xbf\xef\xbb\xbf');
  const chunkSize = 2;
  let result = '';
  let offset = 0;
  while (offset < bytes.length) {
    const chunk = bytes.slice(offset, offset + chunkSize);
    const count = utf8Decoder.decode(chunk, buffer);
    result += toString(buffer, count);
    offset += chunkSize;
  }
  assert.equal(result, '');
});

it('test break after 3 bytes - issue #2495', () => {
const decoder = new Utf8ToUtf32();
const target = new Uint32Array(5);
Expand Down
12 changes: 8 additions & 4 deletions src/common/input/TextDecoder.ts
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,10 @@ export class StringToUtf32 {
}
continue;
}
if (code === 0xFEFF) {
// BOM
continue;
}
target[size++] = code;
}
return size;
Expand Down Expand Up @@ -188,8 +192,8 @@ export class Utf8ToUtf32 {
target[size++] = cp;
}
} else if (type === 3) {
if (cp < 0x0800 || (cp >= 0xD800 && cp <= 0xDFFF)) {
// illegal codepoint
if (cp < 0x0800 || (cp >= 0xD800 && cp <= 0xDFFF) || cp === 0xFEFF) {
// illegal codepoint or BOM
} else {
target[size++] = cp;
}
Expand Down Expand Up @@ -286,8 +290,8 @@ export class Utf8ToUtf32 {
continue;
}
codepoint = (byte1 & 0x0F) << 12 | (byte2 & 0x3F) << 6 | (byte3 & 0x3F);
if (codepoint < 0x0800 || (codepoint >= 0xD800 && codepoint <= 0xDFFF)) {
// illegal codepoint, no i-- here
if (codepoint < 0x0800 || (codepoint >= 0xD800 && codepoint <= 0xDFFF) || codepoint === 0xFEFF) {
// illegal codepoint or BOM, no i-- here
continue;
}
target[size++] = codepoint;
Expand Down

0 comments on commit ff74caf

Please sign in to comment.