From 251c1785a6e3b916bbde38b97de6d608d7a029ff Mon Sep 17 00:00:00 2001
From: John Reeves
Date: Fri, 26 Aug 2016 15:11:42 +0100
Subject: [PATCH] Add support for binary data (#12) (#15)

* Add support for binary data (#12)

* Request implementations now supply a `Uint8Array` of binary data through to the `chunkParser`.
* `defaultChunkParser` converts the `chunkBytes` into a string for processing.
* Overhauled how parserState is handled; no longer assume that the `chunkParser` will emit a string; instead we use a `state` object whose contract is wholly owned by the parser (we just shuffle it around internally).
* Added test coverage for UTF-8 characters.
* Added dependency on the `utf-8` lib for handling (un)marshalling of binary data to text.
* Updated documentation.
* Updated npm dependencies.

Inspiration provided by @mwitkow and @ariutta -- many thanks! :)

* Use TextEncoder / Decoder over `utf-8`.

* Update README to convey which polyfills are required for legacy browsers.

* Remove typedarray polyfill

It would appear that IE10 has typedarray support :)

* Add test to ensure we always get a Uint8Array supplied.

* Instantiate TextEncoder/Decoder only once. Use the stream option.

* Patch up flaky test.

Browsers using the fallback `xhr` transport should expect an additional call to their `chunkParser` as we flush out the state of the `TextEncoder` when the XHR connection is closed.

* Remove left-over utility methods.
---
 .eslintrc.json                     |  8 +++
 README.md                          | 79 +++++++++++++++++++++++++++---
 karma.conf.js                      |  2 +
 package.json                       | 35 ++++++++-----
 src/.eslintrc.json                 |  6 ---
 src/defaultChunkParser.js          | 28 +++++++----
 src/impl/fetch.js                  |  3 +-
 src/impl/mozXhr.js                 |  9 +---
 src/impl/xhr.js                    |  7 ++-
 src/index.js                       | 23 ++++-----
 src/util.js                        |  2 +-
 test/integ/.eslintrc.json          |  4 +-
 test/integ/chunked-request.spec.js | 48 ++++++++++++++++--
 test/server/index.js               |  9 ++++
 14 files changed, 198 insertions(+), 65 deletions(-)

diff --git a/.eslintrc.json b/.eslintrc.json
index 78ba093..00678de 100644
--- a/.eslintrc.json
+++ b/.eslintrc.json
@@ -5,6 +5,14 @@
     "no-var": 2,
     "prefer-const": 2
   },
+  "env": {
+    "browser": true
+  },
+  "globals": {
+    "Uint8Array": false,
+    "TextEncoder": false,
+    "TextDecoder": false
+  },
   "parserOptions": {
     "ecmaVersion": 6,
     "sourceType": "module"
diff --git a/README.md b/README.md
index 6810842..b08162e 100644
--- a/README.md
+++ b/README.md
@@ -16,6 +16,11 @@ $ npm install chunked-request
 or as a standalone ES5 browser script by obtaining `dist/chunked-request.js` from a [tagged release](https://github.com/jonnyreeves/chunked-request/releases).
 
+## Browser Support
+This library is tested against IE 10, Safari, Firefox and Chrome.
+It relies on browser support for the [TypedArray](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/TypedArray), [TextEncoder](https://developer.mozilla.org/en-US/docs/Web/API/TextEncoder) and [TextDecoder](https://developer.mozilla.org/en-US/docs/Web/API/TextDecoder) browser APIs; for legacy environments such as Safari and IE10, you will need to supply one or more of the polyfills listed below:
+
+* [TextEncoder / TextDecoder Polyfill](https://www.npmjs.com/package/text-encoding) (IE10, Safari)
+
 ## API
 
 ```js
@@ -52,16 +57,14 @@ Determine if HTTP cookies will be sent along with the request, one of `same-orig
 A function which implements the following interface:
 
 ```js
-(rawChunk, previousChunkSuffix, isFinalChunk) => [ parsedChunk, chunkSuffix ]
+(chunkBytes, state, flush) => [ parsed, state ]
 ```
 
-The `chunkParser` takes the raw, textual chunk response returned by the server and converts it into the value passed to the `onChunk` callback (see `options.onChunk`). The function may also yield an optional chunkSuffix which will be not be passed to the `onChunk` callback but will instead be supplied as the `previousChunkSuffix` value the next time the `chunkParser` is invoked.
-
-If the `chunkParser` throws an exception, the chunk will be discarded and the error that was raised will be passed to the `onChunk` callback augmented with a `rawChunk` property consisting of the textual chunk for logging / recovery.
+The chunk parser converts the supplied `Uint8Array` of bytes into structured data which will be supplied to the `onChunk` callback. If no `chunkParser` function is supplied, the `defaultChunkParser` will be used, which expects the data to be JSON literals delimited by newline (`\n`) characters.
 
-If no `chunkParser` is supplied the `defaultChunkParser` will be used which expects the chunks returned by the server to consist of one or more `\n` delimited lines of JSON object literals which are parsed into an Array.
+See [Writing a Custom Chunk Parser](#writing-a-custom-chunk-parser) below for more details on how to implement this interface.
 
-`chunkParser` will be called with `isFinalChunk` as `true` when the response has completed and there was a non-empty `chunkSuffix` from the last chunk. The `rawChunk` will be an empty string and the `previousChunkSuffix` will be the last returned `chunkSuffix`.
+If the `chunkParser` throws an exception, the chunk will be discarded and the error that was raised will be passed to the `onChunk` callback, augmented with a `chunkBytes` property that contains the `Uint8Array` supplied to the parser and a `parserState` property which contains the state that was supplied (see below).
 
 #### onChunk (optional)
 A function which implements the following interface:
@@ -94,6 +97,68 @@ A function which implements the following interface:
 ({ url, headers, method, body, credentials, onComplete, onRawChunk }) => undefined
 ```
 
-The underlying function to use to make the request, see the provided implementations if you wish to provide a custom extension.
+The underlying function used to make the request; see the provided implementations if you wish to provide a custom extension. Note that you must supply a `Uint8Array` to the `onRawChunk` callback. If no value is supplied, the `chunkedRequest.transportFactory` function will be invoked to determine which transport method to use.
+The default `transportFactory` will attempt to select the best available method for the current platform; you can override it to substitute a test double or a custom implementation.
+
+
+## Writing a Custom Chunk Parser
+The `chunkParser` takes a 'chunk' of bytes in the form of a `Uint8Array`, as provided by the remote server, and converts it into the value passed to the `onChunk` callback (see `options.onChunk`). In its simplest form the `chunkParser` acts as a pass-through; the following example converts the supplied bytes into a string (here `utf8BytesToString` is your own decoding helper, e.g. a thin wrapper around `TextDecoder`):
+
+```js
+chunkedRequest({
+  chunkParser(bytes) {
+    const str = utf8BytesToString(bytes);
+    return [ str ];
+  },
+  onChunk(err, str) {
+    console.log(`Chunk received: ${str}`);
+  }
+});
+```
+
+Chunk parsers will typically deal with structured data (e.g. JSON literals) where a message can only be parsed if it is well-formed (i.e. a complete JSON literal). Because of the nature of chunked transfer, the server may end up flushing a chunk of data to the browser that contains an incomplete data structure. The example below illustrates this, where the first chunk from the server (Chunk 1) ends with an incomplete JSON literal which is subsequently completed by the following chunk (Chunk 2).
+
+```
+Server (Chunk 1)> { "name": "Jonny" }\n{ "name": "Frank" }\n{ "na
+Server (Chunk 2)> me": "Bob" }
+```
+
+A naive chunk parser implementation would attempt to parse the JSON literals contained in each chunk like so:
+
+```js
+chunkParser(bytes) {
+  const jsonLiterals = utf8BytesToString(bytes).split("\n");
+  // This will not work; array index 2, `{ "na`, is an incomplete JSON
+  // literal and will cause a SyntaxError from JSON.parse.
+  return [ jsonLiterals.map(v => JSON.parse(v)) ];
+}
+```
+
+Instead, the chunkParser should make use of the `state` object to retain any incomplete messages so they can be processed in the next pass:
+
+```js
+chunkParser(bytes, state = {}) {
+  const jsonLiterals = utf8BytesToString(bytes).split("\n");
+
+  // Does the state object contain any data that was not parsed
+  // in a previous pass (see below)?
+  if (state.trailer) {
+    // Glue the data back together for a (potentially) complete literal.
+    jsonLiterals[0] = `${state.trailer}${jsonLiterals[0]}`;
+    state.trailer = '';
+  }
+
+  // If the chunk did not end with a message delimiter, the final array
+  // entry will be an incomplete literal (note that when it *did* end with
+  // a delimiter, split() leaves a trailing empty string).
+  if (jsonLiterals[jsonLiterals.length - 1] !== "") {
+    // Move the last entry into the parser's state as it's incomplete; we
+    // can process it on the next pass.
+    state.trailer = jsonLiterals.pop();
+  }
+
+  // Drop empty entries (caused by trailing delimiters) before parsing.
+  return [ jsonLiterals.filter(v => v !== "").map(v => JSON.parse(v)), state ];
+}
+```
+
+Finally, stateful chunk parsers must observe the third argument, `flush`. This flag will be true when the server has closed the connection, indicating that there will be no further data; the chunkParser must process any remaining data held in the state object at this point, as the sketch below demonstrates.
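+
+Putting it all together, the sketch below extends the parser above to honour all three arguments. It is a minimal illustration rather than a reference implementation, and it assumes the same hypothetical `utf8BytesToString` helper used in the examples above:
+
+```js
+chunkParser(bytes, state = {}, flush) {
+  // Prepend any trailer retained from the previous pass before splitting.
+  const str = `${state.trailer || ''}${utf8BytesToString(bytes)}`;
+  state.trailer = '';
+
+  const jsonLiterals = str.split("\n");
+
+  // Unless we are flushing, hold back a trailing incomplete literal.
+  if (!flush && jsonLiterals[jsonLiterals.length - 1] !== "") {
+    state.trailer = jsonLiterals.pop();
+  }
+
+  return [
+    jsonLiterals.filter(v => v !== "").map(v => JSON.parse(v)),
+    state
+  ];
+}
+```
+
+Because the trailer is not held back when `flush` is true, any text still buffered in `state` is parsed (or fails loudly) before the request completes; this mirrors the behaviour of the library's own `defaultChunkParser`.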
\ No newline at end of file
diff --git a/karma.conf.js b/karma.conf.js
index f08bffe..17fe04c 100644
--- a/karma.conf.js
+++ b/karma.conf.js
@@ -68,6 +68,7 @@ module.exports = function(config) {
 
     // list of files / patterns to load in the browser
     files: [
+      'node_modules/text-encoding/lib/encoding.js',
       'build/integration-tests.js'
     ],
 
@@ -78,6 +79,7 @@ module.exports = function(config) {
 
     proxies: {
       '/chunked-response': 'http://localhost:2001/chunked-response',
+      '/chunked-utf8-response': 'http://localhost:2001/chunked-utf8-response',
       '/split-chunked-response': 'http://localhost:2001/split-chunked-response',
       '/error-response': 'http://localhost:2001/error-response',
       '/echo-response': 'http://localhost:2001/echo-response'
diff --git a/package.json b/package.json
index 77760fb..2ced622 100644
--- a/package.json
+++ b/package.json
@@ -5,7 +5,14 @@
   "jsnext:main": "src/index.js",
   "repository": "https://github.com/jonnyreeves/chunked-request",
   "license": "MIT",
-  "keywords": [ "request", "chunked", "transfer", "comet", "xhr", "fetch" ],
+  "keywords": [
+    "request",
+    "chunked",
+    "transfer",
+    "comet",
+    "xhr",
+    "fetch"
+  ],
   "scripts": {
     "prepublish": "npm run clean && npm run build:lib",
     "clean": "rm -rf build/*",
@@ -17,19 +24,21 @@
     "release": "./release.sh ${npm_package_version}"
   },
   "devDependencies": {
-    "babel-cli": "^6.6.5",
-    "babel-preset-es2015": "^6.6.0",
-    "babelify": "^7.2.0",
-    "browserify": "^13.0.0",
-    "cookie": "^0.2.3",
-    "eslint": "^2.4.0",
+    "babel-cli": "^6.11.4",
+    "babel-preset-es2015": "^6.13.2",
+    "babelify": "^7.3.0",
+    "browserify": "^13.1.0",
+    "cookie": "^0.3.1",
+    "eslint": "^3.3.1",
     "jasmine": "^2.4.1",
     "jasmine-core": "^2.4.1",
-    "karma": "^0.13.22",
-    "karma-chrome-launcher": "^0.2.2",
-    "karma-jasmine": "^0.3.8",
-    "karma-sauce-launcher": "^0.3.1",
-    "lodash": "^4.6.1",
+    "karma": "^1.2.0",
+    "karma-chrome-launcher": "^1.0.1",
+    "karma-jasmine": "^1.0.2",
+    "karma-sauce-launcher": "^1.0.0",
+    "lodash": "^4.15.0",
+    "text-encoding": "^0.6.0",
     "url": "^0.11.0"
-  }
+  },
+  "dependencies": {}
 }
diff --git a/src/.eslintrc.json b/src/.eslintrc.json
index c2cc46d..d363116 100644
--- a/src/.eslintrc.json
+++ b/src/.eslintrc.json
@@ -1,10 +1,4 @@
 {
-  "env": {
-    "browser": true
-  },
-  "globals": {
-    "Uint8Array": false
-  },
   "rules": {
     "no-var": 2
   }
diff --git a/src/defaultChunkParser.js b/src/defaultChunkParser.js
index c6f63c3..e11230b 100644
--- a/src/defaultChunkParser.js
+++ b/src/defaultChunkParser.js
@@ -3,26 +3,34 @@ const entryDelimiter = '\n';
 // The defaultChunkParser expects the response from the server to consist of new-line
 // delimited JSON, eg:
 //
-// { "chunk": "#1", "data": "Hello" }
+// { "chunk": "#1", "data": "Hello" }\n
 // { "chunk": "#2", "data": "World" }
 //
 // It will correctly handle the case where a chunk is emitted by the server across
 // delimiter boundaries.
-export default function defaultChunkParser(rawChunk, prevChunkSuffix = '', isFinalChunk = false) {
-  let chunkSuffix;
-
-  const rawChunks = `${prevChunkSuffix}${rawChunk}`
-    .split(entryDelimiter);
+export default function defaultChunkParser(bytes, state = {}, flush = false) {
+  if (!state.textDecoder) {
+    state.textDecoder = new TextDecoder();
+  }
+  const textDecoder = state.textDecoder;
+  const chunkStr = textDecoder.decode(bytes, { stream: !flush });
+  const jsonLiterals = chunkStr.split(entryDelimiter);
+  if (state.trailer) {
+    jsonLiterals[0] = `${state.trailer}${jsonLiterals[0]}`;
+    state.trailer = '';
+  }
 
-  if (!isFinalChunk && !hasSuffix(rawChunk, entryDelimiter)) {
-    chunkSuffix = rawChunks.pop();
+  // Is this a complete message? If not, push the trailing (incomplete) string
+  // into the state.
+  if (!flush && !hasSuffix(chunkStr, entryDelimiter)) {
+    state.trailer = jsonLiterals.pop();
   }
 
-  const processedChunks = rawChunks
+  const jsonObjects = jsonLiterals
     .filter(v => v.trim() !== '')
     .map(v => JSON.parse(v));
 
-  return [ processedChunks, chunkSuffix ];
+  return [ jsonObjects, state ];
 }
 
 function hasSuffix(s, suffix) {
diff --git a/src/impl/fetch.js b/src/impl/fetch.js
index ad1d818..3d4eaed 100644
--- a/src/impl/fetch.js
+++ b/src/impl/fetch.js
@@ -3,7 +3,6 @@ import { isObject } from '../util';
 export const READABLE_BYTE_STREAM = 'readable-byte-stream';
 
 export default function fetchRequest(options) {
-  const decoder = new TextDecoder();
   const { onRawChunk, onRawComplete, method, body, credentials } = options;
   const headers = marshallHeaders(options.headers);
 
@@ -17,7 +16,7 @@ export default function fetchRequest(options) {
         raw: res
       });
     }
-    onRawChunk(decoder.decode(result.value));
+    onRawChunk(result.value);
     return pump(reader, res);
   });
 }
diff --git a/src/impl/mozXhr.js b/src/impl/mozXhr.js
index 3b1fb86..c46dc55 100644
--- a/src/impl/mozXhr.js
+++ b/src/impl/mozXhr.js
@@ -4,14 +4,7 @@ export default function mozXhrRequest(options) {
   const xhr = new XMLHttpRequest();
 
   function onProgressEvent() {
-    const view = new Uint8Array(xhr.response);
-    let len = view.length;
-
-    const rawString = new Array(len);
-    while(len--) {
-      rawString[len] = String.fromCharCode(view[len]);
-    }
-    options.onRawChunk(rawString.join(''));
+    options.onRawChunk(new Uint8Array(xhr.response));
   }
 
   function onLoadEvent() {
diff --git a/src/impl/xhr.js b/src/impl/xhr.js
index 6db125e..b1d115f 100644
--- a/src/impl/xhr.js
+++ b/src/impl/xhr.js
@@ -1,16 +1,19 @@
 export const XHR = 'xhr';
 
 export default function xhrRequest(options) {
+  const textEncoder = new TextEncoder();
   const xhr = new XMLHttpRequest();
   let index = 0;
 
   function onProgressEvent() {
-    const rawChunk = xhr.responseText.substr(index);
+    const rawText = xhr.responseText.substr(index);
     index = xhr.responseText.length;
-    options.onRawChunk(rawChunk);
+    options.onRawChunk(textEncoder.encode(rawText, { stream: true }));
  }
 
   function onLoadEvent() {
+    // Force the textEncoder to flush any buffered state.
+    options.onRawChunk(textEncoder.encode('', { stream: false }));
     options.onRawComplete({
       statusCode: xhr.status,
       transport: XHR,
diff --git a/src/index.js b/src/index.js
index 07a89bb..afc2611 100644
--- a/src/index.js
+++ b/src/index.js
@@ -20,31 +20,32 @@ export default function chunkedRequest(options) {
     chunkParser = defaultChunkParser
   } = options;
 
-  let prevChunkSuffix = "";
+  // parserState can be utilised by the chunkParser to hold on to state; the
+  // defaultChunkParser uses it to keep track of any trailing text after the
+  // last delimiter in the chunk.
+  // There is no contract for parserState; it is wholly owned by the chunkParser.
+  let parserState;
 
-  function processRawChunk(rawChunk, isFinalChunk = false) {
+  function processRawChunk(chunkBytes, flush = false) {
     let parsedChunks = null;
     let parseError = null;
-    let suffix = "";
     try {
-      [ parsedChunks, suffix ] = chunkParser(rawChunk, prevChunkSuffix, isFinalChunk);
-      prevChunkSuffix = suffix || "";
+      [ parsedChunks, parserState ] = chunkParser(chunkBytes, parserState, flush);
     } catch (e) {
       parseError = e;
-      parseError.rawChunk = rawChunk;
-      parseError.prevChunkSuffix = prevChunkSuffix;
+      parseError.chunkBytes = chunkBytes;
+      parseError.parserState = parserState;
     } finally {
-      if (parseError || (parsedChunks !== null && parsedChunks.length > 0)) {
+      if (parseError || (parsedChunks && parsedChunks.length > 0)) {
         onChunk(parseError, parsedChunks);
       }
     }
   }
 
   function processRawComplete(rawComplete) {
-    if (prevChunkSuffix != "") {
-      // Call the parser with isFinalChunk=true to flush the prevChunkSuffix
-      processRawChunk("", true);
+    if (parserState) {
+      // Flush the parser to process any remaining state.
+      processRawChunk(new Uint8Array(), true);
     }
     onComplete(rawComplete);
   }
diff --git a/src/util.js b/src/util.js
index c5dd590..98a44c1 100644
--- a/src/util.js
+++ b/src/util.js
@@ -4,4 +4,4 @@ export function isObject(value) {
 
 export function noop() {
   /* No operation */
-}
\ No newline at end of file
+}
diff --git a/test/integ/.eslintrc.json b/test/integ/.eslintrc.json
index c44372d..9d79619 100644
--- a/test/integ/.eslintrc.json
+++ b/test/integ/.eslintrc.json
@@ -1,6 +1,8 @@
 {
   "env": {
-    "browser": true,
     "jasmine": true
+  },
+  "rules": {
+    "no-console": 0
   }
 }
\ No newline at end of file
diff --git a/test/integ/chunked-request.spec.js b/test/integ/chunked-request.spec.js
index bf6a8fe..1e3930e 100644
--- a/test/integ/chunked-request.spec.js
+++ b/test/integ/chunked-request.spec.js
@@ -24,7 +24,42 @@ describe('chunked-request', () => {
       url: `/chunked-response?numChunks=1&entriesPerChunk=1&delimitLast=1`,
       onChunk: (err, chunk) => receivedChunks.push(err || chunk),
       onComplete
-    })
+    });
+  });
+
+  it('should supply a Uint8Array to the chunkParser', done => {
+    let actual = false;
+
+    const onComplete = () => {
+      expect(actual).toBe(true);
+      done();
+    };
+
+    chunkedRequest({
+      url: `/chunked-response?numChunks=1&entriesPerChunk=1&delimitLast=1`,
+      chunkParser: bytes => { actual = (bytes instanceof Uint8Array); },
+      onComplete
+    });
+  });
+
+  it('should parse utf8 responses', done => {
+    const receivedChunks = [];
+
+    const onComplete = () => {
+      const chunkErrors = receivedChunks.filter(v => v instanceof Error);
+
+      expect(receivedChunks.length).toBe(1, 'receivedChunks');
+      expect(chunkErrors.length).toBe(0, 'of which errors');
+      expect(isEqual(receivedChunks, [ [ {message: "𝌆"} ] ])).toBe(true, 'parsed chunks');
+
+      done();
+    };
+
+    chunkedRequest({
+      url: `/chunked-utf8-response`,
+      onChunk: (err, chunk) => receivedChunks.push(err || chunk),
+      onComplete
+    });
   });
 
   it('should parse a response that consists of two chunks and ends with a delimiter', done => {
@@ -111,15 +146,20 @@ describe('chunked-request', () => {
       const chunkErrors = receivedChunks.filter(v => v instanceof Error);
       expect(chunkErrors.length).toBe(1, 'one error caught');
       expect(chunkErrors[0].message).toBe('expected');
-      expect(chunkErrors[0].rawChunk).toBe(`{ "chunk": "#1", "data": "#0" }\n`);
+
+      const rawChunkStr = new TextDecoder().decode(chunkErrors[0].chunkBytes);
+      expect(rawChunkStr).toBe(`{ "chunk": "#1", "data": "#0" }\n`);
 
       done();
     };
 
     chunkedRequest({
      url: `/chunked-response?numChunks=1&entriesPerChunk=1&delimitLast=1`,
-      chunkParser: () => {
-        throw new Error("expected");
+      chunkParser: (chunkBytes, state, flush) => {
+        if (chunkBytes.length > 0 && !flush) {
+          throw new Error("expected");
+        }
+        return [];
       },
       onChunk: (err, chunk) => {
         receivedChunks.push(err || chunk)
diff --git a/test/server/index.js b/test/server/index.js
index 92e6304..2c9882d 100644
--- a/test/server/index.js
+++ b/test/server/index.js
@@ -66,6 +66,13 @@ function serveSplitChunkedResponse(req, res) {
   }, chunkIntervalMs);
 }
 
+function serveChunkedUtf8Response(req, res) {
+  res.setHeader('Content-Type', 'text/html; charset=UTF-8');
+  res.setHeader('Transfer-Encoding', 'chunked');
+  res.write(JSON.stringify({ "message": "𝌆" }) + "\n");
+  res.end();
+}
+
 function serveChunkedResponse(req, res) {
   const query = url.parse(req.url, true).query;
   const numChunks = parseInt(query.numChunks, 10) || 4;
@@ -108,6 +115,8 @@ function handler(req, res) {
   switch (req.parsedUrl.pathname) {
     case '/chunked-response':
       return serveChunkedResponse(req, res);
+    case '/chunked-utf8-response':
+      return serveChunkedUtf8Response(req, res);
     case '/split-chunked-response':
       return serveSplitChunkedResponse(req, res);
     case '/echo-response':