From 08283059ad359016fec14a495093490f3277d604 Mon Sep 17 00:00:00 2001 From: sttk Date: Sat, 9 Apr 2022 19:47:56 +0900 Subject: [PATCH 1/7] fix: Remove the dependency on remove-bom-buffer and use TextDecoder instead --- index.js | 10 ++++++++-- package.json | 2 +- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/index.js b/index.js index 6047d4b..8dac9ab 100644 --- a/index.js +++ b/index.js @@ -1,7 +1,10 @@ 'use strict'; var through = require('through2'); -var removeBom = require('remove-bom-buffer'); +var isUTF8 = require('is-utf8'); +var TextDecoder = require('util').TextDecoder; + +var removeBom = new TextDecoder('utf-8', { ignoreBOM: false }); function removeBomStream() { var state = 0; // 0:Not removed, -1:In removing, 1:Already removed @@ -14,7 +17,10 @@ function removeBomStream() { buffer = null; - return removeBom(data); + if (isUTF8(data)) { + return removeBom.decode(data); + } + return data; } function onChunk(data, enc, cb) { diff --git a/package.json b/package.json index ca167de..7dc04ac 100644 --- a/package.json +++ b/package.json @@ -22,7 +22,7 @@ "test": "nyc mocha --async-only" }, "dependencies": { - "remove-bom-buffer": "^3.0.0", + "is-utf8": "^0.2.1", "through2": "^4.0.2" }, "devDependencies": { From 52ce320187fa280f97a7321044f1e266b0b4d6eb Mon Sep 17 00:00:00 2001 From: sttk Date: Sat, 9 Apr 2022 19:57:25 +0900 Subject: [PATCH 2/7] update!: Remove is-utf8 and change to take encoding as an argument --- README.md | 6 +++--- index.js | 8 +++++--- package.json | 1 - test/index.js | 18 +++++++++--------- 4 files changed, 17 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index b9a6e4e..0c818fb 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ var concat = require('concat-stream'); var removeBOM = require('remove-bom-stream'); fs.createReadStream('utf8-file-with-bom.txt') - .pipe(removeBOM()) + .pipe(removeBOM('utf-8')) .pipe( concat(function (result) { // result won't have a BOM @@ -28,9 +28,9 @@ 
fs.createReadStream('utf8-file-with-bom.txt') ## API -### `removeBOM()` +### `removeBOM(encoding)` -Returns a `through2` stream that will remove a BOM, given the data is a UTF8 Buffer with a BOM at the beginning. If the data is not UTF8 or does not have a BOM, the data is not changed and this becomes a normal passthrough stream. +Returns a `through2` stream that will remove a BOM, if the argument `encoding` is `'utf-8'` and the given data is a UTF8 Buffer with a BOM at the beginning. If the `encoding` is not `'utf-8'` or the data does not have a BOM, the data is not changed and this becomes a normal passthrough stream. ## License diff --git a/index.js b/index.js index 8dac9ab..82f5418 100644 --- a/index.js +++ b/index.js @@ -1,12 +1,14 @@ 'use strict'; var through = require('through2'); -var isUTF8 = require('is-utf8'); var TextDecoder = require('util').TextDecoder; var removeBom = new TextDecoder('utf-8', { ignoreBOM: false }); -function removeBomStream() { +function removeBomStream(encoding) { + encoding = (encoding || '').toLowerCase(); + var isUtf8 = (encoding === 'utf-8' || encoding === 'utf8'); + var state = 0; // 0:Not removed, -1:In removing, 1:Already removed var buffer = Buffer.alloc(0); @@ -17,7 +19,7 @@ function removeBomStream() { buffer = null; - if (isUTF8(data)) { + if (isUtf8) { return removeBom.decode(data); } return data; diff --git a/package.json b/package.json index 7dc04ac..be77b04 100644 --- a/package.json +++ b/package.json @@ -22,7 +22,6 @@ "test": "nyc mocha --async-only" }, "dependencies": { - "is-utf8": "^0.2.1", "through2": "^4.0.2" }, "devDependencies": { diff --git a/test/index.js b/test/index.js index 5789090..b8c9be7 100644 --- a/test/index.js +++ b/test/index.js @@ -24,7 +24,7 @@ describe('removeBomStream', function () { } pipe( - [fs.createReadStream(filepath), removeBomStream(), concat(assert)], + [fs.createReadStream(filepath), removeBomStream('utf-8'), concat(assert)], done ); }); @@ -33,7 +33,7 @@ describe('removeBomStream', function 
() { var filepath = path.join(__dirname, './fixtures/test.txt'); var fileContent = fs.readFileSync(filepath, 'utf-8'); - var rmBom = removeBomStream(); + var rmBom = removeBomStream('utf8'); var output = ''; rmBom.on('data', function (d) { output += d.toString(); @@ -55,7 +55,7 @@ describe('removeBomStream', function () { } pipe( - [fs.createReadStream(filepath), removeBomStream(), concat(assert)], + [fs.createReadStream(filepath), removeBomStream('UTF-8'), concat(assert)], done ); }); @@ -73,7 +73,7 @@ describe('removeBomStream', function () { [ fs.createReadStream(filepath), chunker(1), - removeBomStream(), + removeBomStream('UTF8'), concat(assert), ], done @@ -92,7 +92,7 @@ describe('removeBomStream', function () { } pipe( - [fs.createReadStream(filepath), removeBomStream(), concat(assert)], + [fs.createReadStream(filepath), removeBomStream('UTF-8'), concat(assert)], done ); }); @@ -101,7 +101,7 @@ describe('removeBomStream', function () { var filepath = path.join(__dirname, './fixtures/bom-utf8.txt'); var fileContent = fs.readFileSync(filepath, 'utf-8'); - var rmBom = removeBomStream(); + var rmBom = removeBomStream('utf-8'); var output = ''; rmBom.on('data', function (d) { output += d.toString(); @@ -123,7 +123,7 @@ describe('removeBomStream', function () { } pipe( - [fs.createReadStream(filepath), removeBomStream(), concat(assert)], + [fs.createReadStream(filepath), removeBomStream('utf-16be'), concat(assert)], done ); }); @@ -138,7 +138,7 @@ describe('removeBomStream', function () { } pipe( - [fs.createReadStream(filepath), removeBomStream(), concat(assert)], + [fs.createReadStream(filepath), removeBomStream('utf-16be'), concat(assert)], done ); }); @@ -153,7 +153,7 @@ describe('removeBomStream', function () { } pipe( - [fs.createReadStream(filepath), removeBomStream(), concat(assert)], + [fs.createReadStream(filepath), removeBomStream('utf-16le'), concat(assert)], done ); }); From 9bde600300f22f78228533686e916e0b4c011e92 Mon Sep 17 00:00:00 2001 From: 
Blaine Bublitz Date: Mon, 11 Apr 2022 18:57:31 -0700 Subject: [PATCH 3/7] Use TextDecoder as a stream --- index.js | 61 +++++++++++++++++++------------------------------------- 1 file changed, 20 insertions(+), 41 deletions(-) diff --git a/index.js b/index.js index 82f5418..7e3b7ab 100644 --- a/index.js +++ b/index.js @@ -3,57 +3,36 @@ var through = require('through2'); var TextDecoder = require('util').TextDecoder; -var removeBom = new TextDecoder('utf-8', { ignoreBOM: false }); - function removeBomStream(encoding) { encoding = (encoding || '').toLowerCase(); - var isUtf8 = (encoding === 'utf-8' || encoding === 'utf8'); - - var state = 0; // 0:Not removed, -1:In removing, 1:Already removed - var buffer = Buffer.alloc(0); - - return through(onChunk, onFlush); + var isUTF8 = (encoding === 'utf-8' || encoding === 'utf8'); - function removeAndCleanup(data) { - state = 1; // Already removed - - buffer = null; - - if (isUtf8) { - return removeBom.decode(data); - } - return data; - } + // Only used if encoding is UTF-8 + var decoder = new TextDecoder('utf-8', { ignoreBOM: false }); - function onChunk(data, enc, cb) { - if (state === 1) { - return cb(null, data); - } + var state = 0; // 0:Not removed, -1:In removing, 1:Already removed - if (state === 0 /* Not removed */ && data.length >= 7) { - return cb(null, removeAndCleanup(data)); - } + return through(onChunk); - state = -1; // In removing + function onChunk(data, _, cb) { + if (state === 1 || !isUTF8) { + cb(null, data); + } else { + try { + state = -1; - var bufferLength = buffer.length; - var chunkLength = data.length; - var totalLength = bufferLength + chunkLength; + var chunk = decoder.decode(data, { stream: true }); - buffer = Buffer.concat([buffer, data], totalLength); + // The first time we have data after a decode, it should have already removed the BOM + if (chunk !== '') { + state = 1 + } - if (totalLength >= 7) { - return cb(null, removeAndCleanup(buffer)); + cb(null, Buffer.from(chunk, encoding)); + } 
catch (err) { + cb(err); + } } - cb(); - } - - function onFlush(cb) { - if (state === 2 /* Already removed */ || !buffer) { - return cb(); - } - - cb(null, removeAndCleanup(buffer)); } } From 281535c79a65ad1020a35305bf8445644edaaa8f Mon Sep 17 00:00:00 2001 From: sttk Date: Sun, 17 Apr 2022 14:17:03 +0900 Subject: [PATCH 4/7] fix: treat a case that a BOM is not removed on Node <= v11 --- index.js | 40 ++++++++++++++++++++++++++++------------ 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/index.js b/index.js index 7e3b7ab..cd90b2e 100644 --- a/index.js +++ b/index.js @@ -3,10 +3,16 @@ var through = require('through2'); var TextDecoder = require('util').TextDecoder; +var BOM = Buffer.from([0xEF, 0xBB, 0xBF], 'utf-8'); + function removeBomStream(encoding) { encoding = (encoding || '').toLowerCase(); var isUTF8 = (encoding === 'utf-8' || encoding === 'utf8'); + if (!isUTF8) { + return through(); + } + // Only used if encoding is UTF-8 var decoder = new TextDecoder('utf-8', { ignoreBOM: false }); @@ -15,23 +21,33 @@ function removeBomStream(encoding) { return through(onChunk); function onChunk(data, _, cb) { - if (state === 1 || !isUTF8) { + if (state === 1) { cb(null, data); - } else { - try { - state = -1; + return; + } - var chunk = decoder.decode(data, { stream: true }); + try { + state = -1; - // The first time we have data after a decode, it should have already removed the BOM - if (chunk !== '') { - state = 1 - } + var chunk = decoder.decode(data, { stream: true }); - cb(null, Buffer.from(chunk, encoding)); - } catch (err) { - cb(err); + // The first time we have data after a decode, it should have already removed the BOM + if (chunk !== '') { + chunk += decoder.decode(); // end of stream mode and clear inner buffer. + + var buffer = Buffer.from(chunk, 'utf-8'); + + // Node<=v11, TextDecoder#decode returns a BOM if it receives a BOM separately. 
+ if (BOM.compare(buffer) !== 0) { + state = 1; + cb(null, buffer); + return; + } } + + cb(); + } catch (err) { + cb(err); } } } From 91c082c08b9c5dc9696ea877a835b5e1ce7015db Mon Sep 17 00:00:00 2001 From: Blaine Bublitz Date: Mon, 18 Apr 2022 13:59:16 -0700 Subject: [PATCH 5/7] Update index.js --- index.js | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/index.js b/index.js index cd90b2e..05496dc 100644 --- a/index.js +++ b/index.js @@ -37,7 +37,8 @@ function removeBomStream(encoding) { var buffer = Buffer.from(chunk, 'utf-8'); - // Node<=v11, TextDecoder#decode returns a BOM if it receives a BOM separately. + // Node<=v12, TextDecoder#decode returns a BOM if it receives a BOM separately. + // Ref https://github.com/nodejs/node/pull/30132 if (BOM.compare(buffer) !== 0) { state = 1; cb(null, buffer); From d757f5341a2251a92d3f02f606da2e77a61bbb99 Mon Sep 17 00:00:00 2001 From: Blaine Bublitz Date: Mon, 18 Apr 2022 14:04:53 -0700 Subject: [PATCH 6/7] string bom --- index.js | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/index.js b/index.js index 05496dc..dab60b8 100644 --- a/index.js +++ b/index.js @@ -3,7 +3,7 @@ var through = require('through2'); var TextDecoder = require('util').TextDecoder; -var BOM = Buffer.from([0xEF, 0xBB, 0xBF], 'utf-8'); +var BOM = '\ufeff'; function removeBomStream(encoding) { encoding = (encoding || '').toLowerCase(); @@ -35,12 +35,12 @@ function removeBomStream(encoding) { if (chunk !== '') { chunk += decoder.decode(); // end of stream mode and clear inner buffer. - var buffer = Buffer.from(chunk, 'utf-8'); - // Node<=v12, TextDecoder#decode returns a BOM if it receives a BOM separately. 
// Ref https://github.com/nodejs/node/pull/30132 - if (BOM.compare(buffer) !== 0) { + if (chunk !== BOM) { state = 1; + var buffer = Buffer.from(chunk, 'utf-8'); + cb(null, buffer); return; } From 6aa11cf7aaa8967990e6491dde5d526b4d078f85 Mon Sep 17 00:00:00 2001 From: Blaine Bublitz Date: Mon, 18 Apr 2022 14:26:09 -0700 Subject: [PATCH 7/7] Update index.js --- index.js | 1 + 1 file changed, 1 insertion(+) diff --git a/index.js b/index.js index dab60b8..7a707b5 100644 --- a/index.js +++ b/index.js @@ -9,6 +9,7 @@ function removeBomStream(encoding) { encoding = (encoding || '').toLowerCase(); var isUTF8 = (encoding === 'utf-8' || encoding === 'utf8'); + // Needed due to https://github.com/nodejs/node/pull/42779 if (!isUTF8) { return through(); }