Merge pull request #375 from yetzt/patch-1

Use `iconv.decodeStream` instead of `iconv.decode`; Fixes #374
tomas · Apr 7, 2022 · 7d45c46
2 parents 3ac711c + 9c4edac
commit 7d45c46
Show file tree

Hide file tree

Showing 2 changed files with 116 additions and 13 deletions.
diff --git a/lib/decoder.js b/lib/decoder.js
@@ -18,24 +18,33 @@ function StreamDecoder(charset) {
 StreamDecoder.prototype._transform = function(chunk, encoding, done) {
   var res, found;
 
-  // try get charset from chunk, just once
-  if (this.charset == 'utf8' && !this.parsed_chunk) {
+  // try to get charset from chunk, just once
+  if (!this.parsed_chunk && (this.charset == 'utf8' || this.charset == 'utf-8')) {
     this.parsed_chunk = true;
 
-    var matches = regex.exec(chunk.toString());
-    if (matches) {
-      found = matches[1].toLowerCase();
-      this.charset = found == 'utf-8' ? 'utf8' : found;
+    // look for charset
+    if (regex.test(chunk.toString())) {
+      var charset = (RegExp.$1).toLowerCase().replace('utf8','utf-8'); // canonicalize
+      // override if iconv can handle it
+      if (iconv.encodingExists(charset)) this.charset = charset;
     }
   }
 
-  try {
-    res = iconv.decode(chunk, this.charset);
-  } catch(e) { // something went wrong, just return original chunk
-    res = chunk;
-  }
-
-  this.push(res);
+  // no need to decode utf-8, pass through
+  if (this.charset == 'utf-8') return this.push(chunk), done();
+
+  // initialize stream decoder if not present
+  const self = this;
+  if (!this.decoder) {
+    this.decoder = iconv.decodeStream(this.charset);
+    this.decoder.on("data", function(decoded_chunk){
+      // push decoded chunk
+      self.push(decoded_chunk);
+    });
+  };
+
+  // write chunk to decoder
+  this.decoder.write(chunk);
   done();
 }
 

diff --git a/test/decoder_spec.js b/test/decoder_spec.js
@@ -1,5 +1,6 @@
 var should  = require('should'),
     needle  = require('./../'),
+    decoder = require('./../lib/decoder'),
     Q       = require('q'),
     chardet = require('jschardet'),
     fs      = require('fs'),
@@ -151,4 +152,97 @@ describe('character encoding', function() {
     })
 
   })
+
+  describe('multibyte characters split across chunks', function () {
+
+    describe('with encoding = utf-8', function() {
+
+      var d, 
+        result = Buffer.allocUnsafe(0);
+
+      before(function(done) {
+        d = decoder('utf-8');
+        done();
+      });
+
+      it('reassembles split multibyte characters', function (done) {
+
+        d.on("data", function(chunk){
+          result = Buffer.concat([ result, chunk ]);
+        });
+
+        d.on("end", function(){
+          result.toString("utf-8").should.eql('慶');
+          done();
+        });
+
+        // write '慶' in utf-8 split across chunks
+        d.write(Buffer.from([0xE6]));
+        d.write(Buffer.from([0x85]));
+        d.write(Buffer.from([0xB6]));
+        d.end();
+
+      })
+    })
+
+    describe('with encoding = euc-jp', function() {
+
+      var d, 
+        result = Buffer.allocUnsafe(0);
+
+      before(function(done) {
+        d = decoder('euc-jp');
+        done();
+      });
+
+      it('reassembles split multibyte characters', function (done) {
+
+        d.on("data", function(chunk){
+          result = Buffer.concat([ result, chunk ]);
+        });
+
+        d.on("end", function(){
+          result.toString("utf-8").should.eql('慶');
+          done();
+        });
+
+        // write '慶' in euc-jp split across chunks
+        d.write(Buffer.from([0xB7]));
+        d.write(Buffer.from([0xC4]));
+        d.end();
+
+      })
+    })
+
+    describe('with encoding = gb18030', function() {
+
+      var d, 
+        result = Buffer.allocUnsafe(0);
+
+      before(function(done) {
+        d = decoder('gb18030');
+        done();
+      });
+
+      it('reassembles split multibyte characters', function (done) {
+
+        d.on("data", function(chunk){
+          result = Buffer.concat([ result, chunk ]);
+        });
+
+        d.on("end", function(){
+          result.toString("utf-8").should.eql('慶');
+          done();
+        });
+
+        // write '慶' in gb18030 split across chunks
+        d.write(Buffer.from([0x91]));
+        d.write(Buffer.from([0x63]));
+        d.end();
+
+      })
+    })
+
+  })
+
 })