From 9da168b71fb729635ad71e839630480e815623d0 Mon Sep 17 00:00:00 2001 From: Brendan Ashworth Date: Fri, 15 May 2015 19:24:34 -0700 Subject: [PATCH] buffer: optimize Buffer.byteLength Buffer.byteLength is important for speed because it is called whenever a new Buffer is created from a string. This commit optimizes Buffer.byteLength execution by: - moving base64 length calculation into JS-land, which is now much faster - remove redundant code and streamline the UTF8 length calculation It also adds a benchmark and better tests. PR-URL: https://github.com/nodejs/io.js/pull/1713 Reviewed-By: Trevor Norris Reviewed-By: Ben Noordhuis --- benchmark/buffers/buffer-bytelength.js | 55 +++++++++++++++++++++ lib/buffer.js | 66 ++++++++++++++++++------- src/node_buffer.cc | 16 ++---- test/parallel/test-buffer-bytelength.js | 46 +++++++++++++++++ test/parallel/test-buffer.js | 13 ----- 5 files changed, 155 insertions(+), 41 deletions(-) create mode 100644 benchmark/buffers/buffer-bytelength.js create mode 100644 test/parallel/test-buffer-bytelength.js diff --git a/benchmark/buffers/buffer-bytelength.js b/benchmark/buffers/buffer-bytelength.js new file mode 100644 index 00000000000000..6a7afe6921aea1 --- /dev/null +++ b/benchmark/buffers/buffer-bytelength.js @@ -0,0 +1,55 @@ +var common = require('../common'); + +var bench = common.createBenchmark(main, { + encoding: ['utf8', 'base64'], + len: [1, 2, 4, 16, 64, 256], // x16 + n: [5e6] +}); + +// 16 chars each +var chars = [ + 'hello brendan!!!', // 1 byte + 'ΰαβγδεζηθικλμνξο', // 2 bytes + '挰挱挲挳挴挵挶挷挸挹挺挻挼挽挾挿', // 3 bytes + '𠜎𠜱𠝹𠱓𠱸𠲖𠳏𠳕𠴕𠵼𠵿𠸎𠸏𠹷𠺝𠺢' // 4 bytes +]; + +function main(conf) { + var n = conf.n | 0; + var len = conf.len | 0; + var encoding = conf.encoding; + + var strings = []; + for (var string of chars) { + // Strings must be built differently, depending on encoding + var data = buildString(string, len); + if (encoding === 'utf8') { + strings.push(data); + } else if (encoding === 'base64') { + // Base64 strings will be much 
longer than their UTF8 counterparts + strings.push(new Buffer(data, 'utf8').toString('base64')); + } + } + + // Check the result to ensure it is *properly* optimized + var results = strings.map(function(val) { + return Buffer.byteLength(val, encoding); + }); + + bench.start(); + for (var i = 0; i < n; i++) { + var index = i % strings.length; + // Go! + var r = Buffer.byteLength(strings[index], encoding); + + if (r !== results[index]) + throw Error('incorrect return value'); + } + bench.end(n); +} + +function buildString(str, times) { + if (times == 1) return str; + + return str + buildString(str, times - 1); +} diff --git a/lib/buffer.js b/lib/buffer.js index dc2b656d7a7a33..1b9c68465d6b03 100644 --- a/lib/buffer.js +++ b/lib/buffer.js @@ -272,30 +272,62 @@ Buffer.concat = function(list, length) { }; +function base64ByteLength(str, bytes) { + // Handle padding + if (str.charCodeAt(bytes - 1) === 0x3D) + bytes--; + if (bytes > 1 && str.charCodeAt(bytes - 1) === 0x3D) + bytes--; + + // Base64 ratio: 3/4 + return (bytes * 3) >>> 2; +} + + function byteLength(string, encoding) { - if (typeof(string) !== 'string') - string = String(string); + if (typeof string !== 'string') + string = '' + string; - if (string.length === 0) + var len = string.length; + if (len === 0) return 0; - switch (encoding) { - case 'ascii': - case 'binary': - case 'raw': - return string.length; + // Use a for loop to avoid recursion + var loweredCase = false; + for (;;) { + switch (encoding) { + case 'ascii': + case 'binary': + // Deprecated + case 'raw': + case 'raws': + return len; - case 'ucs2': - case 'ucs-2': - case 'utf16le': - case 'utf-16le': - return string.length * 2; + case 'utf8': + case 'utf-8': + return binding.byteLengthUtf8(string); - case 'hex': - return string.length >>> 1; - } + case 'ucs2': + case 'ucs-2': + case 'utf16le': + case 'utf-16le': + return len * 2; + + case 'hex': + return len >>> 1; - return binding.byteLength(string, encoding); + case 'base64': + return 
base64ByteLength(string, len); + + default: + // The C++ binding defaulted to UTF8, we should too. + if (loweredCase) + return binding.byteLengthUtf8(string); + + encoding = ('' + encoding).toLowerCase(); + loweredCase = true; + } + } } Buffer.byteLength = byteLength; diff --git a/src/node_buffer.cc b/src/node_buffer.cc index bd02279212583a..fca08599e50feb 100644 --- a/src/node_buffer.cc +++ b/src/node_buffer.cc @@ -541,17 +541,11 @@ void WriteDoubleBE(const FunctionCallbackInfo& args) { } -void ByteLength(const FunctionCallbackInfo &args) { - Environment* env = Environment::GetCurrent(args); - - if (!args[0]->IsString()) - return env->ThrowTypeError("Argument must be a string"); - - Local s = args[0]->ToString(env->isolate()); - enum encoding e = ParseEncoding(env->isolate(), args[1], UTF8); +void ByteLengthUtf8(const FunctionCallbackInfo &args) { + CHECK(args[0]->IsString()); - uint32_t size = StringBytes::Size(env->isolate(), s, e); - args.GetReturnValue().Set(size); + // Fast case: avoid StringBytes on UTF8 string. Jump to v8. 
+ args.GetReturnValue().Set(args[0].As()->Utf8Length()); } @@ -745,7 +739,7 @@ void Initialize(Handle target, env->SetMethod(target, "setupBufferJS", SetupBufferJS); - env->SetMethod(target, "byteLength", ByteLength); + env->SetMethod(target, "byteLengthUtf8", ByteLengthUtf8); env->SetMethod(target, "compare", Compare); env->SetMethod(target, "fill", Fill); env->SetMethod(target, "indexOfBuffer", IndexOfBuffer); diff --git a/test/parallel/test-buffer-bytelength.js b/test/parallel/test-buffer-bytelength.js new file mode 100644 index 00000000000000..c136c62808e8fc --- /dev/null +++ b/test/parallel/test-buffer-bytelength.js @@ -0,0 +1,46 @@ +'use strict'; + +var common = require('../common'); +var assert = require('assert'); +var Buffer = require('buffer').Buffer; + +// coerce values to string +assert.equal(Buffer.byteLength(32, 'raw'), 2); +assert.equal(Buffer.byteLength(NaN, 'utf8'), 3); +assert.equal(Buffer.byteLength({}, 'raws'), 15); +assert.equal(Buffer.byteLength(), 9); + +// special case: zero length string +assert.equal(Buffer.byteLength('', 'ascii'), 0); +assert.equal(Buffer.byteLength('', 'HeX'), 0); + +// utf8 +assert.equal(Buffer.byteLength('∑éllö wørl∂!', 'utf-8'), 19); +assert.equal(Buffer.byteLength('κλμνξο', 'utf8'), 12); +assert.equal(Buffer.byteLength('挵挶挷挸挹', 'utf-8'), 15); +assert.equal(Buffer.byteLength('𠝹𠱓𠱸', 'UTF8'), 12); +// without an encoding, utf8 should be assumed +assert.equal(Buffer.byteLength('hey there'), 9); +assert.equal(Buffer.byteLength('𠱸挶νξ#xx :)'), 17); +assert.equal(Buffer.byteLength('hello world', ''), 11); +// it should also be assumed with unrecognized encoding +assert.equal(Buffer.byteLength('hello world', 'abc'), 11); +assert.equal(Buffer.byteLength('ßœ∑≈', 'unkn0wn enc0ding'), 10); + +// base64 +assert.equal(Buffer.byteLength('aGVsbG8gd29ybGQ=', 'base64'), 11); +assert.equal(Buffer.byteLength('bm9kZS5qcyByb2NrcyE=', 'base64'), 14); +assert.equal(Buffer.byteLength('aGkk', 'base64'), 3); 
+assert.equal(Buffer.byteLength('bHNrZGZsa3NqZmtsc2xrZmFqc2RsZmtqcw==', + 'base64'), 25); +// special padding +assert.equal(Buffer.byteLength('aaa=', 'base64'), 2); +assert.equal(Buffer.byteLength('aaaa==', 'base64'), 3); + +assert.equal(Buffer.byteLength('Il était tué'), 14); +assert.equal(Buffer.byteLength('Il était tué', 'utf8'), 14); +assert.equal(Buffer.byteLength('Il était tué', 'ascii'), 12); +assert.equal(Buffer.byteLength('Il était tué', 'binary'), 12); +['ucs2', 'ucs-2', 'utf16le', 'utf-16le'].forEach(function(encoding) { + assert.equal(24, Buffer.byteLength('Il était tué', encoding)); +}); diff --git a/test/parallel/test-buffer.js b/test/parallel/test-buffer.js index 53411933a00f9d..1d02148734e38a 100644 --- a/test/parallel/test-buffer.js +++ b/test/parallel/test-buffer.js @@ -561,15 +561,6 @@ assert.equal(sb, s); b = new Buffer('abcde'); assert.equal('bcde', b.slice(1).toString()); -// byte length -assert.equal(14, Buffer.byteLength('Il était tué')); -assert.equal(14, Buffer.byteLength('Il était tué', 'utf8')); -['ucs2', 'ucs-2', 'utf16le', 'utf-16le'].forEach(function(encoding) { - assert.equal(24, Buffer.byteLength('Il était tué', encoding)); -}); -assert.equal(12, Buffer.byteLength('Il était tué', 'ascii')); -assert.equal(12, Buffer.byteLength('Il était tué', 'binary')); - // slice(0,0).length === 0 assert.equal(0, Buffer('hello').slice(0, 0).length); @@ -1074,10 +1065,6 @@ assert.equal(buf.readInt8(0), -1); assert.ok(typeof Buffer(5).slice(0, 5).parent === 'object'); })(); -// Make sure byteLength properly checks for base64 padding -assert.equal(Buffer.byteLength('aaa=', 'base64'), 2); -assert.equal(Buffer.byteLength('aaaa==', 'base64'), 3); - // Regression test for #5482: should throw but not assert in C++ land. assert.throws(function() { Buffer('', 'buffer');