From 62ba6c89721ee1d354c4b864e180004d5508729b Mon Sep 17 00:00:00 2001
From: Zach Bjornson <zbbjornson@gmail.com>
Date: Wed, 21 Sep 2016 18:02:45 -0700
Subject: [PATCH 1/2] src: fix build for older clang

Removes use of the GCC/clang byte-swap builtins (__builtin_bswap16 and
friends), which are unavailable in older clang. Per benchmarks, compiler
intrinsics are now only used with MSVC (Windows), where the speedup is
significant.

Fixes: https://github.com/nodejs/node/issues/7618
---
 src/node_buffer.cc  | 91 ++----------------------------------------
 src/string_bytes.cc | 34 +++++-----------
 src/util-inl.h      | 97 +++++++++++++++++++++++++++++++++++++++++++--
 src/util.h          |  6 ++-
 4 files changed, 113 insertions(+), 115 deletions(-)

diff --git a/src/node_buffer.cc b/src/node_buffer.cc
index 467a6e88474b39..d7c27295e94129 100644
--- a/src/node_buffer.cc
+++ b/src/node_buffer.cc
@@ -12,7 +12,6 @@
 
 #include <string.h>
 #include <limits.h>
-#include <utility>
 
 #define BUFFER_ID 0xB0E4
 
@@ -49,38 +48,6 @@
   THROW_AND_RETURN_IF_OOB(end <= end_max);                                  \
   size_t length = end - start;
 
-#if defined(__GNUC__) || defined(__clang__)
-#define BSWAP_INTRINSIC_2(x) __builtin_bswap16(x)
-#define BSWAP_INTRINSIC_4(x) __builtin_bswap32(x)
-#define BSWAP_INTRINSIC_8(x) __builtin_bswap64(x)
-#elif defined(__linux__)
-#include <byteswap.h>
-#define BSWAP_INTRINSIC_2(x) bswap_16(x)
-#define BSWAP_INTRINSIC_4(x) bswap_32(x)
-#define BSWAP_INTRINSIC_8(x) bswap_64(x)
-#elif defined(_MSC_VER)
-#include <intrin.h>
-#define BSWAP_INTRINSIC_2(x) _byteswap_ushort(x);
-#define BSWAP_INTRINSIC_4(x) _byteswap_ulong(x);
-#define BSWAP_INTRINSIC_8(x) _byteswap_uint64(x);
-#else
-#define BSWAP_INTRINSIC_2(x) ((x) << 8) | ((x) >> 8)
-#define BSWAP_INTRINSIC_4(x)                                                  \
-  (((x) & 0xFF) << 24) |                                                      \
-  (((x) & 0xFF00) << 8) |                                                     \
-  (((x) >> 8) & 0xFF00) |                                                     \
-  (((x) >> 24) & 0xFF)
-#define BSWAP_INTRINSIC_8(x)                                                  \
-  (((x) & 0xFF00000000000000ull) >> 56) |                                     \
-  (((x) & 0x00FF000000000000ull) >> 40) |                                     \
-  (((x) & 0x0000FF0000000000ull) >> 24) |                                     \
-  (((x) & 0x000000FF00000000ull) >> 8) |                                      \
-  (((x) & 0x00000000FF000000ull) << 8) |                                      \
-  (((x) & 0x0000000000FF0000ull) << 24) |                                     \
-  (((x) & 0x000000000000FF00ull) << 40) |                                     \
-  (((x) & 0x00000000000000FFull) << 56)
-#endif
-
 namespace node {
 
 // if true, all Buffer and SlowBuffer instances will automatically zero-fill
@@ -1204,23 +1171,7 @@ void Swap16(const FunctionCallbackInfo<Value>& args) {
   Environment* env = Environment::GetCurrent(args);
   THROW_AND_RETURN_UNLESS_BUFFER(env, args[0]);
   SPREAD_ARG(args[0], ts_obj);
-
-  CHECK_EQ(ts_obj_length % 2, 0);
-
-  int align = reinterpret_cast<uintptr_t>(ts_obj_data) % sizeof(uint16_t);
-
-  if (align == 0) {
-    uint16_t* data16 = reinterpret_cast<uint16_t*>(ts_obj_data);
-    size_t len16 = ts_obj_length / 2;
-    for (size_t i = 0; i < len16; i++) {
-      data16[i] = BSWAP_INTRINSIC_2(data16[i]);
-    }
-  } else {
-    for (size_t i = 0; i < ts_obj_length; i += 2) {
-      std::swap(ts_obj_data[i], ts_obj_data[i + 1]);
-    }
-  }
-
+  SwapBytes16(ts_obj_data, ts_obj_length);
   args.GetReturnValue().Set(args[0]);
 }
 
@@ -1229,24 +1180,7 @@ void Swap32(const FunctionCallbackInfo<Value>& args) {
   Environment* env = Environment::GetCurrent(args);
   THROW_AND_RETURN_UNLESS_BUFFER(env, args[0]);
   SPREAD_ARG(args[0], ts_obj);
-
-  CHECK_EQ(ts_obj_length % 4, 0);
-
-  int align = reinterpret_cast<uintptr_t>(ts_obj_data) % sizeof(uint32_t);
-
-  if (align == 0) {
-    uint32_t* data32 = reinterpret_cast<uint32_t*>(ts_obj_data);
-    size_t len32 = ts_obj_length / 4;
-    for (size_t i = 0; i < len32; i++) {
-      data32[i] = BSWAP_INTRINSIC_4(data32[i]);
-    }
-  } else {
-    for (size_t i = 0; i < ts_obj_length; i += 4) {
-      std::swap(ts_obj_data[i], ts_obj_data[i + 3]);
-      std::swap(ts_obj_data[i + 1], ts_obj_data[i + 2]);
-    }
-  }
-
+  SwapBytes32(ts_obj_data, ts_obj_length);
   args.GetReturnValue().Set(args[0]);
 }
 
@@ -1255,26 +1189,7 @@ void Swap64(const FunctionCallbackInfo<Value>& args) {
   Environment* env = Environment::GetCurrent(args);
   THROW_AND_RETURN_UNLESS_BUFFER(env, args[0]);
   SPREAD_ARG(args[0], ts_obj);
-
-  CHECK_EQ(ts_obj_length % 8, 0);
-
-  int align = reinterpret_cast<uintptr_t>(ts_obj_data) % sizeof(uint64_t);
-
-  if (align == 0) {
-    uint64_t* data64 = reinterpret_cast<uint64_t*>(ts_obj_data);
-    size_t len32 = ts_obj_length / 8;
-    for (size_t i = 0; i < len32; i++) {
-      data64[i] = BSWAP_INTRINSIC_8(data64[i]);
-    }
-  } else {
-    for (size_t i = 0; i < ts_obj_length; i += 8) {
-      std::swap(ts_obj_data[i], ts_obj_data[i + 7]);
-      std::swap(ts_obj_data[i + 1], ts_obj_data[i + 6]);
-      std::swap(ts_obj_data[i + 2], ts_obj_data[i + 5]);
-      std::swap(ts_obj_data[i + 3], ts_obj_data[i + 4]);
-    }
-  }
-
+  SwapBytes64(ts_obj_data, ts_obj_length);
   args.GetReturnValue().Set(args[0]);
 }
 
diff --git a/src/string_bytes.cc b/src/string_bytes.cc
index d9e8b97114e2cd..882ca6e3e89bd3 100644
--- a/src/string_bytes.cc
+++ b/src/string_bytes.cc
@@ -309,27 +309,13 @@ size_t StringBytes::Write(Isolate* isolate,
       if (chars_written != nullptr)
         *chars_written = nchars;
 
-      if (!IsBigEndian())
-        break;
-
       // Node's "ucs2" encoding wants LE character data stored in
       // the Buffer, so we need to reorder on BE platforms.  See
       // http://nodejs.org/api/buffer.html regarding Node's "ucs2"
       // encoding specification
+      if (IsBigEndian())
+        SwapBytes16(buf, nbytes);
 
-      const bool is_aligned =
-          reinterpret_cast<uintptr_t>(buf) % sizeof(uint16_t);
-      if (is_aligned) {
-        uint16_t* const dst = reinterpret_cast<uint16_t*>(buf);
-        SwapBytes(dst, dst, nchars);
-      }
-
-      ASSERT_EQ(sizeof(uint16_t), 2);
-      for (size_t i = 0; i < nchars; i++) {
-        char tmp = buf[i * 2];
-        buf[i * 2] = buf[i * 2 + 1];
-        buf[i * 2 + 1] = tmp;
-      }
       break;
     }
 
@@ -705,17 +691,19 @@ Local<Value> StringBytes::Encode(Isolate* isolate,
 Local<Value> StringBytes::Encode(Isolate* isolate,
                                  const uint16_t* buf,
                                  size_t buflen) {
-  Local<String> val;
+  // Node's "ucs2" encoding expects LE character data inside a
+  // Buffer, so we need to reorder on BE platforms.  See
+  // http://nodejs.org/api/buffer.html regarding Node's "ucs2"
+  // encoding specification
   std::vector<uint16_t> dst;
   if (IsBigEndian()) {
-    // Node's "ucs2" encoding expects LE character data inside a
-    // Buffer, so we need to reorder on BE platforms.  See
-    // http://nodejs.org/api/buffer.html regarding Node's "ucs2"
-    // encoding specification
-    dst.resize(buflen);
-    SwapBytes(&dst[0], buf, buflen);
+    dst.assign(buf, buf + buflen);
+    size_t nbytes = buflen * sizeof(dst[0]);
+    SwapBytes16(reinterpret_cast<char*>(&dst[0]), nbytes);
     buf = &dst[0];
   }
+
+  Local<String> val;
   if (buflen < EXTERN_APEX) {
     val = String::NewFromTwoByte(isolate,
                                  buf,
diff --git a/src/util-inl.h b/src/util-inl.h
index 51adb816926e52..5ffe5b857f5381 100644
--- a/src/util-inl.h
+++ b/src/util-inl.h
@@ -4,6 +4,30 @@
 #if defined(NODE_WANT_INTERNALS) && NODE_WANT_INTERNALS
 
 #include "util.h"
+#include <cstring>
+
+// BSWAP_N(x): byte-swap an N-byte unsigned x; assign the result to that type.
+#if defined(_MSC_VER)
+#include <intrin.h>
+#define BSWAP_2(x) _byteswap_ushort(x)
+#define BSWAP_4(x) _byteswap_ulong(x)
+#define BSWAP_8(x) _byteswap_uint64(x)
+#else
+// Portable fallbacks; parenthesized so each expansion is a single operand.
+#define BSWAP_2(x) (((x) << 8) | ((x) >> 8))
+#define BSWAP_4(x)                                                            \
+  ((((x) & 0xFF) << 24) | (((x) & 0xFF00) << 8) |                             \
+   (((x) >> 8) & 0xFF00) | (((x) >> 24) & 0xFF))
+#define BSWAP_8(x)                                                            \
+  ((((x) & 0xFF00000000000000ull) >> 56) |                                    \
+   (((x) & 0x00FF000000000000ull) >> 40) |                                    \
+   (((x) & 0x0000FF0000000000ull) >> 24) |                                    \
+   (((x) & 0x000000FF00000000ull) >> 8) |                                     \
+   (((x) & 0x00000000FF000000ull) << 8) |                                     \
+   (((x) & 0x0000000000FF0000ull) << 24) |                                    \
+   (((x) & 0x000000000000FF00ull) << 40) |                                    \
+   (((x) & 0x00000000000000FFull) << 56))
+#endif
 
 namespace node {
 
@@ -200,9 +224,76 @@ TypeName* Unwrap(v8::Local<v8::Object> object) {
   return static_cast<TypeName*>(pointer);
 }
 
-void SwapBytes(uint16_t* dst, const uint16_t* src, size_t buflen) {
-  for (size_t i = 0; i < buflen; i += 1)
-    dst[i] = (src[i] << 8) | (src[i] >> 8);
+// Reverses the byte order of each 16-bit word in data[0, nbytes), in place.
+void SwapBytes16(char* data, size_t nbytes) {
+  CHECK_EQ(nbytes % 2, 0);
+
+#if defined(_MSC_VER)
+  // MSVC has no strict aliasing and optimizes this aligned word loop well.
+  if (reinterpret_cast<uintptr_t>(data) % sizeof(uint16_t) == 0) {
+    uint16_t* const first = reinterpret_cast<uint16_t*>(data);
+    uint16_t* const last = first + nbytes / sizeof(*first);
+    for (uint16_t* word = first; word != last; ++word)
+      *word = BSWAP_2(*word);
+    return;
+  }
+#endif
+
+  // memcpy keeps the access valid for any alignment of data.
+  for (char* p = data; p < data + nbytes; p += sizeof(uint16_t)) {
+    uint16_t word;
+    memcpy(&word, p, sizeof(word));
+    word = BSWAP_2(word);
+    memcpy(p, &word, sizeof(word));
+  }
+}
+
+// Reverses the byte order of each 32-bit word in data[0, nbytes), in place.
+void SwapBytes32(char* data, size_t nbytes) {
+  CHECK_EQ(nbytes % 4, 0);
+
+#if defined(_MSC_VER)
+  // MSVC has no strict aliasing and optimizes this aligned word loop well.
+  if (reinterpret_cast<uintptr_t>(data) % sizeof(uint32_t) == 0) {
+    uint32_t* const first = reinterpret_cast<uint32_t*>(data);
+    uint32_t* const last = first + nbytes / sizeof(*first);
+    for (uint32_t* word = first; word != last; ++word)
+      *word = BSWAP_4(*word);
+    return;
+  }
+#endif
+
+  // memcpy keeps the access valid for any alignment of data.
+  for (char* p = data; p < data + nbytes; p += sizeof(uint32_t)) {
+    uint32_t word;
+    memcpy(&word, p, sizeof(word));
+    word = BSWAP_4(word);
+    memcpy(p, &word, sizeof(word));
+  }
+}
+
+// Reverses the byte order of each 64-bit word in data[0, nbytes), in place.
+void SwapBytes64(char* data, size_t nbytes) {
+  CHECK_EQ(nbytes % 8, 0);
+
+#if defined(_MSC_VER)
+  // MSVC has no strict aliasing and optimizes this aligned word loop well.
+  if (reinterpret_cast<uintptr_t>(data) % sizeof(uint64_t) == 0) {
+    uint64_t* const first = reinterpret_cast<uint64_t*>(data);
+    uint64_t* const last = first + nbytes / sizeof(*first);
+    for (uint64_t* word = first; word != last; ++word)
+      *word = BSWAP_8(*word);
+    return;
+  }
+#endif
+
+  // memcpy keeps the access valid for any alignment of data.
+  for (char* p = data; p < data + nbytes; p += sizeof(uint64_t)) {
+    uint64_t word;
+    memcpy(&word, p, sizeof(word));
+    word = BSWAP_8(word);
+    memcpy(p, &word, sizeof(word));
+  }
 }
 
 char ToLower(char c) {
diff --git a/src/util.h b/src/util.h
index 25f2eb01783144..e2f9df02bc4361 100644
--- a/src/util.h
+++ b/src/util.h
@@ -254,7 +254,11 @@ inline void ClearWrap(v8::Local<v8::Object> object);
 template <typename TypeName>
 inline TypeName* Unwrap(v8::Local<v8::Object> object);
 
-inline void SwapBytes(uint16_t* dst, const uint16_t* src, size_t buflen);
+// Byte-swaps every 16-, 32- or 64-bit word of data in place.  nbytes is the
+// length of data in bytes and is CHECKed to be a multiple of the word size.
+inline void SwapBytes16(char* data, size_t nbytes);
+inline void SwapBytes32(char* data, size_t nbytes);
+inline void SwapBytes64(char* data, size_t nbytes);
 
 // tolower() is locale-sensitive.  Use ToLower() instead.
 inline char ToLower(char c);

From 2b69933796a1b5f1c59cd206b75e3616624fea2c Mon Sep 17 00:00:00 2001
From: Zach Bjornson <zbbjornson@gmail.com>
Date: Sat, 24 Sep 2016 19:17:45 -0400
Subject: [PATCH 2/2] test: add test for unaligned ucs2 buffer write

Between #3410 and #7645, bytes were swapped twice on big-endian
platforms if the buffer was not two-byte aligned. See comment in #7645.
---
 test/parallel/test-buffer-alloc.js | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/test/parallel/test-buffer-alloc.js b/test/parallel/test-buffer-alloc.js
index 38e3f4763c715f..760ad2e37c05b7 100644
--- a/test/parallel/test-buffer-alloc.js
+++ b/test/parallel/test-buffer-alloc.js
@@ -585,6 +585,12 @@ assert.strictEqual('<Buffer 81 a3 66 6f 6f a3 62 61 72>', x.inspect());
   assert.strictEqual(b.toString(encoding), 'あいうえお');
 });
 
+for (const encoding of ['ucs2', 'ucs-2', 'utf16le', 'utf-16le']) {
+  const buf = Buffer.allocUnsafe(11);
+  buf.write('あいうえお', 1, encoding);  // 10 bytes written at odd offset 1
+  assert.strictEqual(buf.toString(encoding, 1), 'あいうえお');
+}
+
 {
   // latin1 encoding should write only one byte per character.
   const b = Buffer.from([0xde, 0xad, 0xbe, 0xef]);