From 9da168b71fb729635ad71e839630480e815623d0 Mon Sep 17 00:00:00 2001
From: Brendan Ashworth <brendan.ashworth@me.com>
Date: Fri, 15 May 2015 19:24:34 -0700
Subject: [PATCH] buffer: optimize Buffer.byteLength

Buffer.byteLength is important for speed because it is called whenever a
new Buffer is created from a string.

This commit optimizes Buffer.byteLength execution by:
- moving base64 length calculation into JS-land, which is now much
  faster
- removing redundant code and streamlining the UTF8 length calculation

It also adds a benchmark and better tests.

PR-URL: https://github.com/nodejs/io.js/pull/1713
Reviewed-By: Trevor Norris <trev.norris@gmail.com>
Reviewed-By: Ben Noordhuis <info@bnoordhuis.nl>
---
 benchmark/buffers/buffer-bytelength.js  | 55 +++++++++++++++++++++
 lib/buffer.js                           | 66 ++++++++++++++++++-------
 src/node_buffer.cc                      | 16 ++----
 test/parallel/test-buffer-bytelength.js | 46 +++++++++++++++++
 test/parallel/test-buffer.js            | 13 -----
 5 files changed, 155 insertions(+), 41 deletions(-)
 create mode 100644 benchmark/buffers/buffer-bytelength.js
 create mode 100644 test/parallel/test-buffer-bytelength.js

diff --git a/benchmark/buffers/buffer-bytelength.js b/benchmark/buffers/buffer-bytelength.js
new file mode 100644
index 00000000000000..6a7afe6921aea1
--- /dev/null
+++ b/benchmark/buffers/buffer-bytelength.js
@@ -0,0 +1,55 @@
+var common = require('../common');
+
+var bench = common.createBenchmark(main, {
+  encoding: ['utf8', 'base64'],
+  len: [1, 2, 4, 16, 64, 256], // repeat count; x16 chars per sample string
+  n: [5e6]
+});
+
+// Sample strings of 16 characters each, grouped by UTF-8 bytes per character
+var chars = [
+  'hello brendan!!!', // 1 byte
+  'ΰαβγδεζηθικλμνξο', // 2 bytes
+  '挰挱挲挳挴挵挶挷挸挹挺挻挼挽挾挿', // 3 bytes
+  '𠜎𠜱𠝹𠱓𠱸𠲖𠳏𠳕𠴕𠵼𠵿𠸎𠸏𠹷𠺝𠺢' // 4 bytes
+];
+
+// Benchmark entry point: times `n` Buffer.byteLength calls for `encoding`.
+function main(conf) {
+  var n = conf.n | 0;
+  var len = conf.len | 0;
+  var encoding = conf.encoding;
+  var strings = [];
+  for (var string of chars) {
+    // Strings must be built differently, depending on encoding
+    var data = buildString(string, len);
+    if (encoding === 'utf8') {
+      strings.push(data);
+    } else if (encoding === 'base64') {
+      // Base64 strings will be much longer than their UTF8 counterparts
+      strings.push(new Buffer(data, 'utf8').toString('base64'));
+    }
+  }
+
+  // Precompute the expected lengths so every timed call can be verified
+  var results = strings.map(function(val) {
+    return Buffer.byteLength(val, encoding);
+  });
+
+  bench.start();
+  for (var i = 0; i < n; i++) {
+    // Index by the loop counter `i` (not the constant `n`) to rotate strings
+    var index = i % strings.length;
+    var r = Buffer.byteLength(strings[index], encoding);
+
+    if (r !== results[index])
+      throw new Error('incorrect return value');
+  }
+  bench.end(n);
+}
+
+// Repeats `str` `times` times; any count below 1 still yields one copy.
+function buildString(str, times) {
+  if (times <= 1) return str;
+  return str + buildString(str, times - 1);
+}
diff --git a/lib/buffer.js b/lib/buffer.js
index dc2b656d7a7a33..1b9c68465d6b03 100644
--- a/lib/buffer.js
+++ b/lib/buffer.js
@@ -272,30 +272,62 @@ Buffer.concat = function(list, length) {
 };
 
 
+function base64ByteLength(str, bytes) {
+  // Handle padding: discount up to two trailing '=' (0x3D) characters
+  if (str.charCodeAt(bytes - 1) === 0x3D)
+    bytes--;
+  if (bytes > 1 && str.charCodeAt(bytes - 1) === 0x3D)
+    bytes--;
+
+  // Base64 ratio: 3/4 (4 chars encode 3 bytes); `>>> 2` divides by 4, floored
+  return (bytes * 3) >>> 2;
+}
+
+
 function byteLength(string, encoding) {
-  if (typeof(string) !== 'string')
-    string = String(string);
+  if (typeof string !== 'string')
+    string = '' + string;
 
-  if (string.length === 0)
+  var len = string.length;
+  if (len === 0)
     return 0;
 
-  switch (encoding) {
-    case 'ascii':
-    case 'binary':
-    case 'raw':
-      return string.length;
+  // Loop, not recursion: an unmatched encoding is lowercased and retried once
+  var loweredCase = false;
+  for (;;) {
+    switch (encoding) {
+      case 'ascii':
+      case 'binary':
+      // Deprecated
+      case 'raw':
+      case 'raws':
+        return len;
 
-    case 'ucs2':
-    case 'ucs-2':
-    case 'utf16le':
-    case 'utf-16le':
-      return string.length * 2;
+      case 'utf8':
+      case 'utf-8':
+        return binding.byteLengthUtf8(string);
 
-    case 'hex':
-      return string.length >>> 1;
-  }
+      case 'ucs2':
+      case 'ucs-2':
+      case 'utf16le':
+      case 'utf-16le':
+        return len * 2;
+
+      case 'hex':
+        return len >>> 1;
 
-  return binding.byteLength(string, encoding);
+      case 'base64':
+        return base64ByteLength(string, len);
+
+      default:
+        // The C++ binding defaulted to UTF8, we should too.
+        if (loweredCase)
+          return binding.byteLengthUtf8(string);
+
+        encoding = ('' + encoding).toLowerCase();
+        loweredCase = true;
+    }
+  }
 }
 
 Buffer.byteLength = byteLength;
diff --git a/src/node_buffer.cc b/src/node_buffer.cc
index bd02279212583a..fca08599e50feb 100644
--- a/src/node_buffer.cc
+++ b/src/node_buffer.cc
@@ -541,17 +541,11 @@ void WriteDoubleBE(const FunctionCallbackInfo<Value>& args) {
 }
 
 
-void ByteLength(const FunctionCallbackInfo<Value> &args) {
-  Environment* env = Environment::GetCurrent(args);
-
-  if (!args[0]->IsString())
-    return env->ThrowTypeError("Argument must be a string");
-
-  Local<String> s = args[0]->ToString(env->isolate());
-  enum encoding e = ParseEncoding(env->isolate(), args[1], UTF8);
+void ByteLengthUtf8(const FunctionCallbackInfo<Value> &args) {
+  CHECK(args[0]->IsString());
 
-  uint32_t size = StringBytes::Size(env->isolate(), s, e);
-  args.GetReturnValue().Set(size);
+  // Fast case: skip StringBytes and ask V8 for the UTF-8 length directly.
+  args.GetReturnValue().Set(args[0].As<String>()->Utf8Length());
 }
 
 
@@ -745,7 +739,7 @@ void Initialize(Handle<Object> target,
 
   env->SetMethod(target, "setupBufferJS", SetupBufferJS);
 
-  env->SetMethod(target, "byteLength", ByteLength);
+  env->SetMethod(target, "byteLengthUtf8", ByteLengthUtf8);
   env->SetMethod(target, "compare", Compare);
   env->SetMethod(target, "fill", Fill);
   env->SetMethod(target, "indexOfBuffer", IndexOfBuffer);
diff --git a/test/parallel/test-buffer-bytelength.js b/test/parallel/test-buffer-bytelength.js
new file mode 100644
index 00000000000000..c136c62808e8fc
--- /dev/null
+++ b/test/parallel/test-buffer-bytelength.js
@@ -0,0 +1,46 @@
+'use strict';
+
+var common = require('../common');
+var assert = require('assert');
+var Buffer = require('buffer').Buffer;
+
+// coerce values to string: '32', 'NaN', '[object Object]', 'undefined'
+assert.equal(Buffer.byteLength(32, 'raw'), 2);
+assert.equal(Buffer.byteLength(NaN, 'utf8'), 3);
+assert.equal(Buffer.byteLength({}, 'raws'), 15);
+assert.equal(Buffer.byteLength(), 9);
+
+// special case: zero length string
+assert.equal(Buffer.byteLength('', 'ascii'), 0);
+assert.equal(Buffer.byteLength('', 'HeX'), 0);
+
+// utf8
+assert.equal(Buffer.byteLength('∑éllö wørl∂!', 'utf-8'), 19);
+assert.equal(Buffer.byteLength('κλμνξο', 'utf8'), 12);
+assert.equal(Buffer.byteLength('挵挶挷挸挹', 'utf-8'), 15);
+assert.equal(Buffer.byteLength('𠝹𠱓𠱸', 'UTF8'), 12);
+// without an encoding, utf8 should be assumed
+assert.equal(Buffer.byteLength('hey there'), 9);
+assert.equal(Buffer.byteLength('𠱸挶νξ#xx :)'), 17);
+assert.equal(Buffer.byteLength('hello world', ''), 11);
+// it should also be assumed with unrecognized encoding
+assert.equal(Buffer.byteLength('hello world', 'abc'), 11);
+assert.equal(Buffer.byteLength('ßœ∑≈', 'unkn0wn enc0ding'), 10);
+
+// base64
+assert.equal(Buffer.byteLength('aGVsbG8gd29ybGQ=', 'base64'), 11);
+assert.equal(Buffer.byteLength('bm9kZS5qcyByb2NrcyE=', 'base64'), 14);
+assert.equal(Buffer.byteLength('aGkk', 'base64'), 3);
+assert.equal(Buffer.byteLength('bHNrZGZsa3NqZmtsc2xrZmFqc2RsZmtqcw==',
+    'base64'), 25);
+// special padding: trailing '=' must be excluded from the count
+assert.equal(Buffer.byteLength('aaa=', 'base64'), 2);
+assert.equal(Buffer.byteLength('aaaa==', 'base64'), 3);
+
+assert.equal(Buffer.byteLength('Il était tué'), 14);
+assert.equal(Buffer.byteLength('Il était tué', 'utf8'), 14);
+assert.equal(Buffer.byteLength('Il était tué', 'ascii'), 12);
+assert.equal(Buffer.byteLength('Il était tué', 'binary'), 12);
+['ucs2', 'ucs-2', 'utf16le', 'utf-16le'].forEach(function(encoding) {
+  assert.equal(Buffer.byteLength('Il était tué', encoding), 24);
+});
diff --git a/test/parallel/test-buffer.js b/test/parallel/test-buffer.js
index 53411933a00f9d..1d02148734e38a 100644
--- a/test/parallel/test-buffer.js
+++ b/test/parallel/test-buffer.js
@@ -561,15 +561,6 @@ assert.equal(sb, s);
 b = new Buffer('abcde');
 assert.equal('bcde', b.slice(1).toString());
 
-// byte length
-assert.equal(14, Buffer.byteLength('Il était tué'));
-assert.equal(14, Buffer.byteLength('Il était tué', 'utf8'));
-['ucs2', 'ucs-2', 'utf16le', 'utf-16le'].forEach(function(encoding) {
-  assert.equal(24, Buffer.byteLength('Il était tué', encoding));
-});
-assert.equal(12, Buffer.byteLength('Il était tué', 'ascii'));
-assert.equal(12, Buffer.byteLength('Il était tué', 'binary'));
-
 // slice(0,0).length === 0
 assert.equal(0, Buffer('hello').slice(0, 0).length);
 
@@ -1074,10 +1065,6 @@ assert.equal(buf.readInt8(0), -1);
   assert.ok(typeof Buffer(5).slice(0, 5).parent === 'object');
 })();
 
-// Make sure byteLength properly checks for base64 padding
-assert.equal(Buffer.byteLength('aaa=', 'base64'), 2);
-assert.equal(Buffer.byteLength('aaaa==', 'base64'), 3);
-
 // Regression test for #5482: should throw but not assert in C++ land.
 assert.throws(function() {
   Buffer('', 'buffer');