Skip to content

Commit

Permalink
Merge pull request #375 from yetzt/patch-1
Browse files Browse the repository at this point in the history
Use `iconv.decodeStream` instead of `iconv.decode`; Fixes #374
  • Loading branch information
tomas authored Apr 7, 2022
2 parents 3ac711c + 9c4edac commit 7d45c46
Show file tree
Hide file tree
Showing 2 changed files with 116 additions and 13 deletions.
35 changes: 22 additions & 13 deletions lib/decoder.js
Original file line number Diff line number Diff line change
Expand Up @@ -18,24 +18,33 @@ function StreamDecoder(charset) {
// NOTE(review): this listing is the commit's diff for lib/decoder.js rendered
// without +/- markers, so the replaced (old) and replacing (new) lines of the
// hunk appear back to back. It is not runnable as shown; the bracketed
// [old]/[new] labels below mark which version each run of lines belongs to.
//
// Transform step of StreamDecoder. In the new version it:
//   1. on the first chunk only, and only while the charset is 'utf8'/'utf-8',
//      looks for a charset declaration inside the chunk (via `regex`, defined
//      outside this view) and adopts it if iconv reports it as supported;
//   2. passes the chunk through untouched when the charset is 'utf-8';
//   3. otherwise writes the chunk into a lazily created iconv.decodeStream and
//      forwards whatever that stream emits.
//
// chunk    - data to decode; the code calls chunk.toString() and hands it to iconv
// encoding - not used by this function
// done     - callback invoked once the chunk has been handed off
StreamDecoder.prototype._transform = function(chunk, encoding, done) {
var res, found;

// [old] guard: only sniffed when the charset was exactly 'utf8'
// try get charset from chunk, just once
if (this.charset == 'utf8' && !this.parsed_chunk) {
// [new] guard: also accepts the 'utf-8' spelling
// try to get charset from chunk, just once
if (!this.parsed_chunk && (this.charset == 'utf8' || this.charset == 'utf-8')) {
// set before matching so the sniffing is attempted once, whether or not it matches
this.parsed_chunk = true;

// [old] detection: any matched charset was adopted, with 'utf-8' mapped to 'utf8'
var matches = regex.exec(chunk.toString());
if (matches) {
found = matches[1].toLowerCase();
this.charset = found == 'utf-8' ? 'utf8' : found;
// [new] detection: RegExp.$1 is the legacy static property holding capture
// group 1 of the most recent successful match (here, the regex.test call)
// look for charset
if (regex.test(chunk.toString())) {
var charset = (RegExp.$1).toLowerCase().replace('utf8','utf-8'); // canonicalize
// override if iconv can handle it
if (iconv.encodingExists(charset)) this.charset = charset;
}
}

// [old] decoding: each chunk was decoded on its own with iconv.decode, falling
// back to the raw chunk if decoding threw
try {
res = iconv.decode(chunk, this.charset);
} catch(e) { // something went wrong, just return original chunk
res = chunk;
}

this.push(res);
// [new] decoding starts here.
// The comma expression pushes the chunk first, then returns the result of done().
// NOTE(review): only the 'utf-8' spelling passes through; a charset left as
// 'utf8' (accepted by the guard above, canonicalized only when found in the
// chunk) falls through to the iconv decoder below -- confirm this is intended.
// no need to decode utf-8, pass through
if (this.charset == 'utf-8') return this.push(chunk), done();

// The decoder is created on first use and kept on the instance, so later
// chunks are written into the same iconv stream.
// initialize stream decoder if not present
const self = this;
if (!this.decoder) {
this.decoder = iconv.decodeStream(this.charset);
this.decoder.on("data", function(decoded_chunk){
// push decoded chunk
self.push(decoded_chunk);
});
};

// NOTE(review): nothing visible in this hunk ends this.decoder, so whether
// bytes still buffered inside it are emitted at end of input depends on code
// outside this view -- confirm.
// write chunk to decoder
this.decoder.write(chunk);
done();
}

Expand Down
94 changes: 94 additions & 0 deletions test/decoder_spec.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
var should = require('should'),
needle = require('./../'),
decoder = require('./../lib/decoder'),
Q = require('q'),
chardet = require('jschardet'),
fs = require('fs'),
Expand Down Expand Up @@ -151,4 +152,97 @@ describe('character encoding', function() {
})

})

describe('multibyte characters split across chunks', function () {

  // Every case writes the single character '慶' to the decoder one byte per
  // chunk, using the byte sequence for that character in the listed charset,
  // and expects the concatenated output to read back as '慶' in utf-8.
  var cases = [
    { title: 'with encoding = utf-8',   charset: 'utf-8',   bytes: [0xE6, 0x85, 0xB6] },
    { title: 'with encoding = euc-jp',  charset: 'euc-jp',  bytes: [0xB7, 0xC4] },
    { title: 'with encoding = gb18030', charset: 'gb18030', bytes: [0x91, 0x63] }
  ];

  cases.forEach(function (spec) {

    describe(spec.title, function () {

      // one decoder and one output accumulator per suite
      var stream,
          collected = Buffer.allocUnsafe(0);

      before(function (done) {
        stream = decoder(spec.charset);
        done();
      });

      it('reassembles split multibyte characters', function (done) {

        // gather everything the decoder emits
        stream.on("data", function (piece) {
          collected = Buffer.concat([ collected, piece ]);
        });

        // once the decoder is finished, the pieces must form the full character
        stream.on("end", function () {
          collected.toString("utf-8").should.eql('慶');
          done();
        });

        // feed the character one byte at a time, then close the input
        spec.bytes.forEach(function (byte) {
          stream.write(Buffer.from([byte]));
        });
        stream.end();

      })
    })
  });

})

})

0 comments on commit 7d45c46

Please sign in to comment.