From c5025508ca887e7a75e2995946f1777d0f8c38c7 Mon Sep 17 00:00:00 2001 From: Yagiz Nizipli Date: Tue, 20 Aug 2024 09:04:30 -0400 Subject: [PATCH] src: improve `buffer.transcode` performance PR-URL: https://github.com/nodejs/node/pull/54153 Reviewed-By: Daniel Lemire Reviewed-By: Benjamin Gruenbaum Reviewed-By: Matteo Collina Reviewed-By: Minwoo Jung Reviewed-By: James M Snell --- benchmark/buffers/buffer-transcode.js | 35 +++++++++ src/node_i18n.cc | 107 +++++++++++--------------- 2 files changed, 79 insertions(+), 63 deletions(-) create mode 100644 benchmark/buffers/buffer-transcode.js diff --git a/benchmark/buffers/buffer-transcode.js b/benchmark/buffers/buffer-transcode.js new file mode 100644 index 00000000000000..cbb3b2e9b16374 --- /dev/null +++ b/benchmark/buffers/buffer-transcode.js @@ -0,0 +1,35 @@ +'use strict'; +const common = require('../common.js'); +const assert = require('node:assert'); +const buffer = require('node:buffer'); + +const hasIntl = !!process.config.variables.v8_enable_i18n_support; +const encodings = ['latin1', 'ascii', 'ucs2', 'utf8']; + +if (!hasIntl) { + console.log('Skipping: `transcode` is only available on platforms that support i18n`'); + process.exit(0); +} + +const bench = common.createBenchmark(main, { + fromEncoding: encodings, + toEncoding: encodings, + length: [1, 10, 1000], + n: [1e5], +}, { + combinationFilter(p) { + return !(p.fromEncoding === 'ucs2' && p.toEncoding === 'utf8'); + }, +}); + +function main({ n, fromEncoding, toEncoding, length }) { + const input = Buffer.from('a'.repeat(length)); + let out = 0; + bench.start(); + for (let i = 0; i < n; i++) { + const dest = buffer.transcode(input, fromEncoding, toEncoding); + out += dest.buffer.byteLength; + } + bench.end(n); + assert.ok(out >= 0); +} diff --git a/src/node_i18n.cc b/src/node_i18n.cc index d45325954d9807..2aa7cd98ecc179 100644 --- a/src/node_i18n.cc +++ b/src/node_i18n.cc @@ -42,6 +42,7 @@ #include "node_i18n.h" #include "node_external_reference.h" +#include "simdutf.h" #if defined(NODE_HAVE_I18N_SUPPORT) @@ -146,7 +147,6 @@ MaybeLocal Transcode(Environment* env, const char* source, const size_t source_length, UErrorCode* status) { - *status = U_ZERO_ERROR; MaybeLocal ret; MaybeStackBuffer result; Converter to(toEncoding); @@ -169,22 +169,21 @@ MaybeLocal Transcode(Environment* env, return ret; } -MaybeLocal TranscodeToUcs2(Environment* env, - const char* fromEncoding, - const char* toEncoding, - const char* source, - const size_t source_length, - UErrorCode* status) { - *status = U_ZERO_ERROR; - MaybeLocal ret; +MaybeLocal TranscodeLatin1ToUcs2(Environment* env, + const char* fromEncoding, + const char* toEncoding, + const char* source, + const size_t source_length, + UErrorCode* status) { MaybeStackBuffer destbuf(source_length); - Converter from(fromEncoding); - const size_t length_in_chars = source_length * sizeof(UChar); - ucnv_toUChars(from.conv(), *destbuf, length_in_chars, - source, source_length, status); - if (U_SUCCESS(*status)) - ret = ToBufferEndian(env, &destbuf); - return ret; + auto actual_length = + simdutf::convert_latin1_to_utf16le(source, source_length, destbuf.out()); + if (actual_length == 0) { + *status = U_INVALID_CHAR_FOUND; + return {}; + } + + return Buffer::New(env, &destbuf); } MaybeLocal TranscodeFromUcs2(Environment* env, @@ -193,13 +192,11 @@ MaybeLocal TranscodeFromUcs2(Environment* env, const char* source, const size_t source_length, UErrorCode* status) { - *status = U_ZERO_ERROR; MaybeStackBuffer sourcebuf; MaybeLocal ret; Converter to(toEncoding); - size_t sublen = ucnv_getMinCharSize(to.conv()); - std::string sub(sublen, '?'); + std::string sub(to.min_char_size(), '?'); to.set_subst_chars(sub.c_str()); const size_t length_in_chars = source_length / sizeof(UChar); @@ -220,26 +217,18 @@ MaybeLocal TranscodeUcs2FromUtf8(Environment* env, const char* source, const size_t source_length, UErrorCode* status) { - *status = U_ZERO_ERROR; - MaybeStackBuffer destbuf; - int32_t result_length; - u_strFromUTF8(*destbuf, destbuf.capacity(), &result_length, - source, source_length, status); - MaybeLocal ret; - if (U_SUCCESS(*status)) { - destbuf.SetLength(result_length); - ret = ToBufferEndian(env, &destbuf); - } else if (*status == U_BUFFER_OVERFLOW_ERROR) { - *status = U_ZERO_ERROR; - destbuf.AllocateSufficientStorage(result_length); - u_strFromUTF8(*destbuf, result_length, &result_length, - source, source_length, status); - if (U_SUCCESS(*status)) { - destbuf.SetLength(result_length); - ret = ToBufferEndian(env, &destbuf); - } + size_t expected_utf16_length = + simdutf::utf16_length_from_utf8(source, source_length); + MaybeStackBuffer destbuf(expected_utf16_length); + auto actual_length = + simdutf::convert_utf8_to_utf16le(source, source_length, destbuf.out()); + + if (actual_length == 0) { + *status = U_INVALID_CHAR_FOUND; + return {}; } - return ret; + + return Buffer::New(env, &destbuf); } MaybeLocal TranscodeUtf8FromUcs2(Environment* env, @@ -248,32 +237,25 @@ MaybeLocal TranscodeUtf8FromUcs2(Environment* env, const char* source, const size_t source_length, UErrorCode* status) { - *status = U_ZERO_ERROR; - MaybeLocal ret; const size_t length_in_chars = source_length / sizeof(UChar); - int32_t result_length; - MaybeStackBuffer sourcebuf; - MaybeStackBuffer destbuf; - CopySourceBuffer(&sourcebuf, source, source_length, length_in_chars); - u_strToUTF8(*destbuf, destbuf.capacity(), &result_length, - *sourcebuf, length_in_chars, status); - if (U_SUCCESS(*status)) { - destbuf.SetLength(result_length); - ret = ToBufferEndian(env, &destbuf); - } else if (*status == U_BUFFER_OVERFLOW_ERROR) { - *status = U_ZERO_ERROR; - destbuf.AllocateSufficientStorage(result_length); - u_strToUTF8(*destbuf, result_length, &result_length, *sourcebuf, - length_in_chars, status); - if (U_SUCCESS(*status)) { - destbuf.SetLength(result_length); - ret = ToBufferEndian(env, &destbuf); - } + size_t expected_utf8_length = simdutf::utf8_length_from_utf16le( + reinterpret_cast(source), length_in_chars); + + MaybeStackBuffer destbuf(expected_utf8_length); + auto actual_length = simdutf::convert_utf16le_to_utf8( + reinterpret_cast(source), + length_in_chars, + destbuf.out()); + + if (actual_length == 0) { + *status = U_INVALID_CHAR_FOUND; + return {}; } - return ret; + + return Buffer::New(env, &destbuf); } -const char* EncodingName(const enum encoding encoding) { +constexpr const char* EncodingName(const enum encoding encoding) { switch (encoding) { case ASCII: return "us-ascii"; case LATIN1: return "iso8859-1"; @@ -283,7 +265,7 @@ const char* EncodingName(const enum encoding encoding) { } } -bool SupportedEncoding(const enum encoding encoding) { +constexpr bool SupportedEncoding(const enum encoding encoding) { switch (encoding) { case ASCII: case LATIN1: @@ -308,8 +290,7 @@ void Transcode(const FunctionCallbackInfo&args) { switch (fromEncoding) { case ASCII: case LATIN1: - if (toEncoding == UCS2) - tfn = &TranscodeToUcs2; + if (toEncoding == UCS2) tfn = &TranscodeLatin1ToUcs2; break; case UTF8: if (toEncoding == UCS2)