nodejs · nodejs-github-bot · Feb 17, 2024 · Jan 29, 2024
diff --git a/node.gyp b/node.gyp
@@ -73,6 +73,7 @@
       'src/connection_wrap.cc',
       'src/dataqueue/queue.cc',
       'src/debug_utils.cc',
+      'src/embedded_data.cc',
       'src/encoding_binding.cc',
       'src/env.cc',
       'src/fs_event_wrap.cc',
@@ -193,6 +194,7 @@
       'src/dataqueue/queue.h',
       'src/debug_utils.h',
       'src/debug_utils-inl.h',
+      'src/embedded_data.h',
       'src/encoding_binding.h',
       'src/env_properties.h',
       'src/env.h',
@@ -1214,11 +1216,14 @@
         'deps/simdutf/simdutf.gyp:simdutf#host',
       ],
       'include_dirs': [
-        'tools'
+        'tools',
+        'src',
       ],
       'sources': [
         'tools/js2c.cc',
-        'tools/executable_wrapper.h'
+        'tools/executable_wrapper.h',
+        'src/embedded_data.h',
+        'src/embedded_data.cc',
       ],
       'conditions': [
         [ 'node_shared_libuv=="false"', {

diff --git a/src/embedded_data.cc b/src/embedded_data.cc
@@ -0,0 +1,33 @@
+#include "embedded_data.h"
+#include <vector>
+
+namespace node {
+std::string ToOctalString(const uint8_t ch) {
+  // Returns the text that represents byte `ch` inside a C++ string literal.
+  // Printable ASCII is emitted verbatim, except '\' (starts an escape), "
+  // (would end the string) and ? (trigraphs; possibly overly conservative
+  // because we compile with C++17, which doesn't support trigraphs).
+  if (ch >= ' ' && ch <= '~' && ch != '\\' && ch != '"' && ch != '?') {
+    return std::string(1, static_cast<char>(ch));
+  }
+  // Everything else becomes a backslash followed by three octal digits.
+  const char c0 = '0' + ((ch >> 6) & 7);
+  const char c1 = '0' + ((ch >> 3) & 7);
+  const char c2 = '0' + (ch & 7);
+  return std::string("\\") + c0 + c1 + c2;
+}
+
+std::vector<std::string> GetOctalTable() {
+  size_t size = 1 << 8;  // One entry for every possible uint8_t value.
+  std::vector<std::string> code_table(size);
+  for (size_t i = 0; i < size; ++i) {
+    code_table[i] = ToOctalString(static_cast<uint8_t>(i));
+  }
+  return code_table;  // code_table[b] holds ToOctalString(b).
+}
+
+const std::string& GetOctalCode(uint8_t index) {
+  static std::vector<std::string> table = GetOctalTable();  // Built once.
+  return table[index];  // A uint8_t always indexes within the 256 entries.
+}
+}  // namespace node
diff --git a/src/embedded_data.h b/src/embedded_data.h
@@ -0,0 +1,17 @@
+#ifndef SRC_EMBEDDED_DATA_H_
+#define SRC_EMBEDDED_DATA_H_
+
+#include <cinttypes>
+#include <string>
+
+// This file must not depend on node.h or other code that depends on
+// the full Node.js implementation because it is used during the
+// compilation of the Node.js implementation itself (especially js2c).
+
+namespace node {
+
+const std::string& GetOctalCode(uint8_t index);
+
+}  // namespace node
+
+#endif  // SRC_EMBEDDED_DATA_H_
diff --git a/src/node_snapshotable.cc b/src/node_snapshotable.cc
@@ -8,6 +8,7 @@
 #include "base_object-inl.h"
 #include "blob_serializer_deserializer-inl.h"
 #include "debug_utils-inl.h"
+#include "embedded_data.h"
 #include "encoding_binding.h"
 #include "env-inl.h"
 #include "json_parser.h"
@@ -748,35 +749,6 @@ static std::string FormatSize(size_t size) {
   return buf;
 }
 
-std::string ToOctalString(const uint8_t ch) {
-  // We can print most printable characters directly. The exceptions are '\'
-  // (escape characters), " (would end the string), and ? (trigraphs). The
-  // latter may be overly conservative: we compile with C++17 which doesn't
-  // support trigraphs.
-  if (ch >= ' ' && ch <= '~' && ch != '\\' && ch != '"' && ch != '?') {
-    return std::string(1, static_cast<char>(ch));
-  }
-  // All other characters are blindly output as octal.
-  const char c0 = '0' + ((ch >> 6) & 7);
-  const char c1 = '0' + ((ch >> 3) & 7);
-  const char c2 = '0' + (ch & 7);
-  return std::string("\\") + c0 + c1 + c2;
-}
-
-std::vector<std::string> GetOctalTable() {
-  size_t size = 1 << 8;
-  std::vector<std::string> code_table(size);
-  for (size_t i = 0; i < size; ++i) {
-    code_table[i] = ToOctalString(static_cast<uint8_t>(i));
-  }
-  return code_table;
-}
-
-const std::string& GetOctalCode(uint8_t index) {
-  static std::vector<std::string> table = GetOctalTable();
-  return table[index];
-}
-
 template <typename T>
 void WriteByteVectorLiteral(std::ostream* ss,
                             const T* vec,

diff --git a/tools/js2c.cc b/tools/js2c.cc
@@ -11,6 +11,7 @@
 #include <string>
 #include <string_view>
 #include <vector>
+#include "embedded_data.h"
 #include "executable_wrapper.h"
 #include "simdutf.h"
 #include "uv.h"
@@ -396,11 +397,14 @@ const std::string& GetCode(uint16_t index) {
 
 #ifdef NODE_JS2C_USE_STRING_LITERALS
 const char* string_literal_def_template = "static const %s *%s_raw = ";
+constexpr std::string_view latin1_string_literal_start =
+    "reinterpret_cast<const uint8_t*>(\"";
 constexpr std::string_view ascii_string_literal_start =
     "reinterpret_cast<const uint8_t*>(R\"JS2C1b732aee(";
 constexpr std::string_view utf16_string_literal_start =
     "reinterpret_cast<const uint16_t*>(uR\"JS2C1b732aee(";
-constexpr std::string_view string_literal_end = ")JS2C1b732aee\");";
+constexpr std::string_view latin1_string_literal_end = "\");";
+constexpr std::string_view utf_string_literal_end = ")JS2C1b732aee\");";
 #else
 const char* array_literal_def_template = "static const %s %s_raw[] = ";
 constexpr std::string_view array_literal_start = "{\n";
@@ -424,9 +428,15 @@ constexpr std::string_view array_literal_end = "\n};\n\n";
 // If NODE_JS2C_USE_STRING_LITERALS is defined, the data is output as C++
 // raw strings (i.e. R"JS2C1b732aee(...)JS2C1b732aee") rather than as an
 // array. This speeds up compilation for gcc/clang.
+enum class CodeType {
+  kAscii,   // Code points are all within 0-127
+  kLatin1,  // Code points are all within 0-255
+  kTwoByte,  // Everything else; emitted as UTF-16 code units
+};
 template <typename T>
 Fragment GetDefinitionImpl(const std::vector<char>& code,
-                           const std::string& var) {
+                           const std::string& var,
+                           CodeType type) {
   constexpr bool is_two_byte = std::is_same_v<T, uint16_t>;
   static_assert(is_two_byte || std::is_same_v<T, char>);
 
@@ -440,11 +450,14 @@ Fragment GetDefinitionImpl(const std::vector<char>& code,
 
 #ifdef NODE_JS2C_USE_STRING_LITERALS
   const char* literal_def_template = string_literal_def_template;
-  size_t def_size = 512 + code.size();
+  // For code that contains Latin-1 characters, be conservative and assume
+  // they all need escaping: one "\" and three digits.
+  size_t unit = type == CodeType::kLatin1 ? 4 : 1;
+  size_t def_size = 512 + code.size() * unit;
 #else
   const char* literal_def_template = array_literal_def_template;
   constexpr size_t unit =
-      (is_two_byte ? 5 : 3) + 1;  // 0-65536 or 0-127 and a ","
+      (is_two_byte ? 5 : 3) + 1;  // 0-65536 or 0-255 and a ","
   size_t def_size = 512 + count * unit;
 #endif
 
@@ -456,16 +469,56 @@ Fragment GetDefinitionImpl(const std::vector<char>& code,
   assert(cur != 0);
 
 #ifdef NODE_JS2C_USE_STRING_LITERALS
-  constexpr std::string_view start_string_view =
-      is_two_byte ? utf16_string_literal_start : ascii_string_literal_start;
+  std::string_view start_string_view;
+  switch (type) {
+    case CodeType::kAscii:
+      start_string_view = ascii_string_literal_start;
+      break;
+    case CodeType::kLatin1:
+      start_string_view = latin1_string_literal_start;
+      break;
+    case CodeType::kTwoByte:
+      start_string_view = utf16_string_literal_start;
+      break;
+  }
 
   memcpy(
       result.data() + cur, start_string_view.data(), start_string_view.size());
   cur += start_string_view.size();
 
-  memcpy(result.data() + cur, code.data(), code.size());
-  cur += code.size();
+  if (type != CodeType::kLatin1) {
+    memcpy(result.data() + cur, code.data(), code.size());
+    cur += code.size();
+  } else {
+    const uint8_t* ptr = reinterpret_cast<const uint8_t*>(code.data());
+    for (size_t i = 0; i < count; ++i) {
+      // Avoid using snprintf on large chunks of data because it's much slower.
+      // It's fine to use it on small amount of data though.
+      uint8_t ch = ptr[i];
+      if (ch > 127) {
+        Debug("In %s, found non-ASCII Latin-1 character at %zu: %d\n",
+              var.c_str(),
+              i,
+              ch);
+      }
+      const std::string& str = GetOctalCode(ch);
+      memcpy(result.data() + cur, str.c_str(), str.size());
+      cur += str.size();
+    }
+  }
 
+  std::string_view string_literal_end;
+  switch (type) {
+    case CodeType::kAscii:
+      string_literal_end = utf_string_literal_end;
+      break;
+    case CodeType::kLatin1:
+      string_literal_end = latin1_string_literal_end;
+      break;
+    case CodeType::kTwoByte:
+      string_literal_end = utf_string_literal_end;
+      break;
+  }
   memcpy(result.data() + cur,
          string_literal_end.data(),
          string_literal_end.size());
@@ -476,10 +529,10 @@ Fragment GetDefinitionImpl(const std::vector<char>& code,
          array_literal_start.size());
   cur += array_literal_start.size();
 
-  const std::vector<T>* codepoints;
-
-  std::vector<uint16_t> utf16_codepoints;
+  // Avoid using snprintf on large chunks of data because it's much slower.
+  // It's fine to use it on small amount of data though.
   if constexpr (is_two_byte) {
+    std::vector<uint16_t> utf16_codepoints;
     utf16_codepoints.resize(count);
     size_t utf16_count = simdutf::convert_utf8_to_utf16(
         code.data(),
@@ -488,19 +541,25 @@ Fragment GetDefinitionImpl(const std::vector<char>& code,
     assert(utf16_count != 0);
     utf16_codepoints.resize(utf16_count);
     Debug("static size %zu\n", utf16_count);
-    codepoints = &utf16_codepoints;
+    for (size_t i = 0; i < utf16_count; ++i) {
+      const std::string& str = GetCode(utf16_codepoints[i]);
+      memcpy(result.data() + cur, str.c_str(), str.size());
+      cur += str.size();
+    }
   } else {
-    // The code is ASCII, so no need to translate.
-    codepoints = &code;
-  }
-
-  for (size_t i = 0; i < codepoints->size(); ++i) {
-    // Avoid using snprintf on large chunks of data because it's much slower.
-    // It's fine to use it on small amount of data though.
-    const std::string& str = GetCode(static_cast<uint16_t>((*codepoints)[i]));
-
-    memcpy(result.data() + cur, str.c_str(), str.size());
-    cur += str.size();
+    const uint8_t* ptr = reinterpret_cast<const uint8_t*>(code.data());
+    for (size_t i = 0; i < count; ++i) {
+      uint16_t ch = static_cast<uint16_t>(ptr[i]);
+      if (ch > 127) {
+        Debug("In %s, found non-ASCII Latin-1 character at %zu: %d\n",
+              var.c_str(),
+              i,
+              ch);
+      }
+      const std::string& str = GetCode(ch);
+      memcpy(result.data() + cur, str.c_str(), str.size());
+      cur += str.size();
+    }
   }
 
   memcpy(
@@ -520,17 +579,80 @@ Fragment GetDefinitionImpl(const std::vector<char>& code,
   return result;
 }
 
-Fragment GetDefinition(const std::string& var, const std::vector<char>& code) {
-  Debug("GetDefinition %s, code size %zu ", var.c_str(), code.size());
-  bool is_one_byte = simdutf::validate_ascii(code.data(), code.size());
-  Debug("with %s\n", is_one_byte ? "1-byte chars" : "2-byte chars");
+// Copies `code` into `*simplified` with known multi-byte characters replaced
+// by one-byte equivalents. Returns true only if something was replaced.
+bool Simplify(const std::vector<char>& code,
+              const std::string& var,
+              std::vector<char>* simplified) {
+  // Allowlist files to avoid false positives.
+  // TODO(joyeecheung): this could be removed if undici updates itself
+  // to replace "’" with "'" though we could still keep this skeleton in
+  // place for future hot fixes that are verified by humans.
+  if (var != "internal_deps_undici_undici") {
+    return false;
+  }
 
-  if (is_one_byte) {
-    Debug("static size %zu\n", code.size());
-    return GetDefinitionImpl<char>(code, var);
-  } else {
-    return GetDefinitionImpl<uint16_t>(code, var);
+  size_t code_size = code.size();
+  simplified->reserve(code_size);
+  const uint8_t* ptr = reinterpret_cast<const uint8_t*>(code.data());
+  size_t simplified_count = 0;
+  for (size_t i = 0; i < code_size; ++i) {
+    switch (ptr[i]) {
+      case 226:  // ’ [ 226, 128, 153 ] -> '
+        if (i + 2 < code_size && ptr[i + 1] == 128 && ptr[i + 2] == 153) {
+          simplified->push_back('\'');
+          i += 2;  // Skip the two continuation bytes that were consumed.
+          simplified_count++;
+          break;
+        }
+        // Some other sequence that starts with 226: copy the byte as-is.
+        [[fallthrough]];
+      default:
+        simplified->push_back(code[i]);
+        break;
+    }
   }
+
+  if (simplified_count == 0) return false;
+  Debug("Simplified %zu characters, ", simplified_count);
+  Debug("old size %zu, new size %zu\n", code_size, simplified->size());
+  return true;
+}
+
+Fragment GetDefinition(const std::string& var, const std::vector<char>& code) {
+  Debug("GetDefinition %s, code size %zu\n", var.c_str(), code.size());
+  bool is_ascii = simdutf::validate_ascii(code.data(), code.size());
+
+  if (is_ascii) {
+    Debug("ASCII-only, static size %zu\n", code.size());
+    return GetDefinitionImpl<char>(code, var, CodeType::kAscii);
+  }
+
+  std::vector<char> latin1(code.size());  // Input-sized; trimmed below.
+  auto result = simdutf::convert_utf8_to_latin1_with_errors(
+      code.data(), code.size(), latin1.data());
+  if (!result.error) {  // Conversion succeeded: Latin-1 only.
+    latin1.resize(result.count);  // Keep only the converted bytes.
+    Debug("Latin-1-only, old size %zu, new size %zu\n",
+          code.size(),
+          latin1.size());
+    return GetDefinitionImpl<char>(latin1, var, CodeType::kLatin1);
+  }
+
+  // Since V8 only supports Latin-1 and UTF16 as underlying representation
+  // we have to encode all files containing two-byte characters as UTF16.
+  // While some files do need two-byte characters, some just
+  // unintentionally have them. Replace certain characters that are known
+  // to have sane one-byte equivalent to save space.
+  std::vector<char> simplified;
+  if (Simplify(code, var, &simplified)) {  // True if it replaced anything.
+    Debug("%s is simplified, re-generate definition\n", var.c_str());
+    return GetDefinition(var, simplified);  // Re-classify the new bytes.
+  }
+
+  // Simplification did not turn the code into 1-byte string. Just
+  // use the original.
+  return GetDefinitionImpl<uint16_t>(code, var, CodeType::kTwoByte);
 }
 
 int AddModule(const std::string& filename,