Add TextEncoder.prototype.encodeInto()

Summary: Implement TextEncoder's `encodeInto()` function. Reviewed By: avp Differential Revision: D53216139 fbshipit-source-id: eb4f5a1461084d22c77a7c0723de624f50468785
facebook · Feb 17, 2024 · 3863a36 · 3863a36
1 parent 7f9d9d5
commit 3863a36
Show file tree

Hide file tree

Showing 6 changed files with 257 additions and 29 deletions.
diff --git a/include/hermes/Support/UTF8.h b/include/hermes/Support/UTF8.h
@@ -245,6 +245,16 @@ bool convertUTF16ToUTF8WithReplacements(
     llvh::ArrayRef<char16_t> input,
     size_t maxCharacters = 0);
 
+/// Convert a UTF-16 encoded string \p input into the pre-allocated UTF-8
+/// buffer \p outBuffer, stopping once the next code point no longer fits and
+/// replacing unpaired surrogate halves with the Unicode replacement character.
+/// \return a std::pair with the first element being the number of UTF-16
+///   code units consumed, and the second element being the number of UTF-8
+///   bytes written
+std::pair<uint32_t, uint32_t> convertUTF16ToUTF8BufferWithReplacements(
+    llvh::MutableArrayRef<uint8_t> outBuffer,
+    llvh::ArrayRef<char16_t> input);
+
 /// Convert a UTF-8 encoded string (with surrogates) \p input to a UTF-8 one
 /// (without surrogates), storing the conversion in \p output. Output characters
 /// are appended to \p output.

diff --git a/include/hermes/VM/NativeFunctions.def b/include/hermes/VM/NativeFunctions.def
@@ -364,6 +364,7 @@ NATIVE_FUNCTION(symbolPrototypeValueOf)
 NATIVE_FUNCTION(textEncoderConstructor)
 NATIVE_FUNCTION(textEncoderPrototypeEncoding)
 NATIVE_FUNCTION(textEncoderPrototypeEncode)
+NATIVE_FUNCTION(textEncoderPrototypeEncodeInto)
 NATIVE_FUNCTION(throwTypeError)
 NATIVE_FUNCTION(typedArrayBaseConstructor)
 NATIVE_FUNCTION(typedArrayFrom)

diff --git a/include/hermes/VM/PredefinedStrings.def b/include/hermes/VM/PredefinedStrings.def
@@ -486,6 +486,9 @@ STR(squareSymbolSplit, "[Symbol.split]")
 
 STR(TextEncoder, "TextEncoder")
 STR(encode, "encode")
+STR(encodeInto, "encodeInto")
+STR(read, "read")
+STR(written, "written")
 STR(encoding, "encoding")
 STR(utf8, "utf-8")
 

diff --git a/lib/Support/UTF8.cpp b/lib/Support/UTF8.cpp
@@ -63,6 +63,39 @@ void encodeUTF8(char *&dst, uint32_t cp) {
   dst = d;
 }
 
+/// Decode one scalar value from the UTF-16 code units starting at \p cur.
+/// This combines ES14 11.1.4 CodePointAt() with what
+/// https://infra.spec.whatwg.org/#strings says about lone surrogates: "To
+/// convert a string into a scalar value string, replace any surrogates with
+/// U+FFFD." Any lone surrogate is therefore returned as
+/// UNICODE_REPLACEMENT_CHARACTER (U+FFFD), so callers only ever see scalar
+/// values (code points that are not surrogates).
+/// \param cur Iterator to the current UTF-16 code unit; must not equal \p end
+/// \param end Iterator pointing to the end of the string
+/// \return std::pair with first element being the Unicode code point, and the
+///         second being how many UTF-16 code units were consumed (1 or 2)
+static std::pair<char32_t, size_t> convertToCodePointAt(
+    llvh::ArrayRef<char16_t>::iterator cur,
+    llvh::ArrayRef<char16_t>::iterator end) {
+  char16_t c = cur[0];
+  if (isLowSurrogate(c)) {
+    // Unpaired low surrogate.
+    return {UNICODE_REPLACEMENT_CHARACTER, 1};
+  } else if (isHighSurrogate(c)) {
+    // Leading high surrogate. See if the next code unit is a low surrogate.
+    if (cur + 1 == end || !isLowSurrogate(cur[1])) {
+      // Trailing or unpaired high surrogate.
+      return {UNICODE_REPLACEMENT_CHARACTER, 1};
+    } else {
+      // Valid surrogate pair: decode it and report two consumed code units.
+      return {utf16SurrogatePairToCodePoint(c, cur[1]), 2};
+    }
+  } else {
+    // Not a surrogate.
+    return {c, 1};
+  }
+}
+
 bool convertUTF16ToUTF8WithReplacements(
     std::string &out,
     llvh::ArrayRef<char16_t> input,
@@ -85,40 +118,62 @@ bool convertUTF16ToUTF8WithReplacements(
       continue;
     }
 
-    // The following logic is a combination of ES14 11.1.4 CodePointAt() and
-    // what https://infra.spec.whatwg.org/#strings says about what to do with
-    // singular surrogates: "To convert a string into a scalar value string,
-    // replace any surrogates with U+FFFD." Therefore, if we encounter any lone
-    // surrogate, replace the value with UNICODE_REPLACEMENT_CHARACTER (U+FFFD).
-    // The result of this process is that the enclosing for-loop processes only
-    // scalar values (aka a code point that is not a surrogate).
-    char32_t c32;
-    if (isLowSurrogate(cur[0])) {
-      // Unpaired low surrogate.
-      c32 = UNICODE_REPLACEMENT_CHARACTER;
-    } else if (isHighSurrogate(cur[0])) {
-      // Leading high surrogate. See if the next character is a low surrogate.
-      if (cur + 1 == end || !isLowSurrogate(cur[1])) {
-        // Trailing or unpaired high surrogate.
-        c32 = UNICODE_REPLACEMENT_CHARACTER;
-      } else {
-        // Decode surrogate pair and increment, because we consumed two chars.
-        c32 = utf16SurrogatePairToCodePoint(cur[0], cur[1]);
-        ++cur;
+    auto [c32, inputConsumed] = convertToCodePointAt(cur, end);
+    cur += (inputConsumed - 1);
+
+    // The code point to be encoded here is guaranteed to be a valid Unicode
+    // code point and not a surrogate, because convertToCodePointAt() replaces
+    // lone surrogates with U+FFFD.
+    std::array<char, UTF8CodepointMaxBytes> buff;
+    char *ptr = buff.data();
+    encodeUTF8(ptr, c32);
+    out.insert(out.end(), buff.data(), ptr);
+  }
+  return cur == end;
+}
+
+std::pair<uint32_t, uint32_t> convertUTF16ToUTF8BufferWithReplacements(
+    llvh::MutableArrayRef<uint8_t> outBuffer,
+    llvh::ArrayRef<char16_t> input) {
+  uint32_t numRead = 0;
+  uint32_t numWritten = 0;
+  uint8_t *writtenPtr = outBuffer.begin();
+  auto end = input.end();
+  for (auto cur = input.begin(); cur < end; ++cur) {
+    char16_t c = cur[0];
+    // ASCII fast-path.
+    if (LLVM_LIKELY(c <= 0x7F)) {
+      if (numWritten + 1 > outBuffer.size()) {
+        break;
       }
-    } else {
-      // Not a surrogate.
-      c32 = c;
+      *writtenPtr = static_cast<char>(c);
+      writtenPtr++;
+      numWritten++;
+      numRead++;
+      continue;
     }
 
-    // The code point to be converted here is guaranteed to be a valid unicode
-    // code point and not a surrogate. Because of the conversion above.
-    char buff[UTF8CodepointMaxBytes];
-    char *ptr = buff;
+    auto [c32, inputConsumed] = convertToCodePointAt(cur, end);
+    cur += (inputConsumed - 1);
+
+    // The code point to be encoded here is guaranteed to be a valid Unicode
+    // code point and not a surrogate, because convertToCodePointAt() replaces
+    // lone surrogates with U+FFFD.
+    std::array<char, UTF8CodepointMaxBytes> buff;
+    char *ptr = buff.data();
     encodeUTF8(ptr, c32);
-    out.insert(out.end(), buff, ptr);
+
+    size_t convertedLength = ptr - buff.data();
+    if (numWritten + convertedLength > outBuffer.size()) {
+      break;
+    }
+    std::memcpy(writtenPtr, buff.data(), convertedLength);
+    writtenPtr += convertedLength;
+    numWritten += convertedLength;
+    numRead += inputConsumed;
   }
-  return cur == end;
+
+  return {numRead, numWritten};
 }
 
 void convertUTF16ToUTF8WithSingleSurrogates(

diff --git a/lib/VM/JSLib/TextEncoder.cpp b/lib/VM/JSLib/TextEncoder.cpp
@@ -51,6 +51,14 @@ Handle<JSObject> createTextEncoderConstructor(Runtime &runtime) {
       textEncoderPrototypeEncode,
       1);
 
+  defineMethod(
+      runtime,
+      textEncoderPrototype,
+      Predefined::getSymbolID(Predefined::encodeInto),
+      nullptr,
+      textEncoderPrototypeEncodeInto,
+      2);
+
   auto cons = defineSystemConstructor<JSObject>(
       runtime,
       Predefined::getSymbolID(Predefined::TextEncoder),
@@ -182,5 +190,100 @@ textEncoderPrototypeEncode(void *, Runtime &runtime, NativeArgs args) {
   }
 }
 
+CallResult<HermesValue>
+textEncoderPrototypeEncodeInto(void *, Runtime &runtime, NativeArgs args) {
+  GCScope gcScope{runtime};
+  auto selfHandle = args.dyncastThis<JSObject>();
+  NamedPropertyDescriptor desc;
+  bool exists = JSObject::getOwnNamedDescriptor(
+      selfHandle,
+      runtime,
+      Predefined::getSymbolID(Predefined::InternalPropertyTextEncoderType),
+      desc);
+  if (LLVM_UNLIKELY(!exists)) {
+    return runtime.raiseTypeError(
+        "TextEncoder.prototype.encodeInto() called on non-TextEncoder object");
+  }
+
+  auto strRes = toString_RJS(runtime, args.getArgHandle(0));
+  if (LLVM_UNLIKELY(strRes == ExecutionStatus::EXCEPTION)) {
+    return ExecutionStatus::EXCEPTION;
+  }
+  Handle<StringPrimitive> string = runtime.makeHandle(std::move(*strRes));
+
+  Handle<Uint8Array> typedArray = args.dyncastArg<Uint8Array>(1);
+  if (LLVM_UNLIKELY(!typedArray)) {
+    return runtime.raiseTypeError("The second argument should be a Uint8Array");
+  }
+
+  if (LLVM_UNLIKELY(!typedArray->attached(runtime))) {
+    return runtime.raiseTypeError(
+        "TextEncoder.prototype.encodeInto() called on a detached Uint8Array");
+  }
+
+  PseudoHandle<JSObject> objRes = JSObject::create(runtime, 2);
+  Handle<JSObject> obj = runtime.makeHandle(objRes.get());
+
+  uint32_t numRead = 0;
+  uint32_t numWritten = 0;
+
+  if (LLVM_UNLIKELY(string->getStringLength() == 0)) {
+    numRead = 0;
+    numWritten = 0;
+  } else if (string->isASCII()) {
+    // An ASCII string is already valid UTF-8 because ASCII is a strict subset
+    // of it. However, since the output array size is provided by the caller,
+    // copy at most as many bytes as the array can hold.
+    llvh::ArrayRef<char> strRef = string->getStringRef<char>();
+
+    uint32_t copiedLength =
+        std::min(string->getStringLength(), typedArray->getLength());
+
+    std::memcpy(typedArray->begin(runtime), strRef.data(), copiedLength);
+
+    numRead = copiedLength;
+    numWritten = copiedLength;
+  } else {
+    // Encode the UTF-16 string as UTF-8 directly into the given Uint8Array.
+    llvh::ArrayRef<char16_t> strRef = string->getStringRef<char16_t>();
+    std::pair<uint32_t, uint32_t> result =
+        convertUTF16ToUTF8BufferWithReplacements(
+            llvh::makeMutableArrayRef<uint8_t>(
+                typedArray->begin(runtime), typedArray->getLength()),
+            strRef);
+    numRead = result.first;
+    numWritten = result.second;
+  }
+
+  // Populate the result object: `read` is the number of UTF-16 code units
+  // consumed and `written` is the number of UTF-8 bytes stored.
+  auto numReadHandle =
+      runtime.makeHandle(HermesValue::encodeTrustedNumberValue(numRead));
+  auto numWrittenHandle =
+      runtime.makeHandle(HermesValue::encodeTrustedNumberValue(numWritten));
+
+  auto res = JSObject::defineNewOwnProperty(
+      obj,
+      runtime,
+      Predefined::getSymbolID(Predefined::read),
+      PropertyFlags::defaultNewNamedPropertyFlags(),
+      numReadHandle);
+  if (LLVM_UNLIKELY(res == ExecutionStatus::EXCEPTION)) {
+    return ExecutionStatus::EXCEPTION;
+  }
+
+  res = JSObject::defineNewOwnProperty(
+      obj,
+      runtime,
+      Predefined::getSymbolID(Predefined::written),
+      PropertyFlags::defaultNewNamedPropertyFlags(),
+      numWrittenHandle);
+  if (LLVM_UNLIKELY(res == ExecutionStatus::EXCEPTION)) {
+    return ExecutionStatus::EXCEPTION;
+  }
+
+  return obj.getHermesValue();
+}
+
 } // namespace vm
 } // namespace hermes
diff --git a/test/hermes/text-encoder.js b/test/hermes/text-encoder.js
@@ -64,3 +64,59 @@ print(result.length, result.join(' '));
 result = encoder.encode('\u{D83D}');
 print(result.length, result.join(' '));
 // CHECK-NEXT: 3 239 191 189
+
+result = new Uint8Array(4);
+
+try {
+  const b = {};
+  TextEncoder.prototype.encodeInto.call(b, '', result);
+} catch (e) {
+  print(e.message);
+  // CHECK-NEXT: TextEncoder.prototype.encodeInto() called on non-TextEncoder object
+}
+
+// ASCII case that exactly fills the provided 4-byte buffer
+let stats = encoder.encodeInto('test', result);
+print(stats.read, stats.written);
+// CHECK-NEXT: 4 4
+print(result[0], result[1], result[2], result[3]);
+// CHECK-NEXT: 116 101 115 116
+
+stats = encoder.encodeInto('', result);
+print(stats.read, stats.written);
+// CHECK-NEXT: 0 0
+
+// ASCII case that does NOT fit: only the first 4 characters are encoded
+stats = encoder.encodeInto('testing', result);
+print(stats.read, stats.written);
+// CHECK-NEXT: 4 4
+print(result[0], result[1], result[2], result[3]);
+// CHECK-NEXT: 116 101 115 116
+
+// ASCII case that is smaller than the provided buffer
+stats = encoder.encodeInto('abc', result);
+print(stats.read, stats.written);
+// CHECK-NEXT: 3 3
+print(result[0], result[1], result[2]);
+// CHECK-NEXT: 97 98 99
+
+// Non-ASCII BMP case (3 UTF-8 bytes) that fits within the provided buffer
+stats = encoder.encodeInto('\u{2191}', result);
+print(stats.read, stats.written);
+// CHECK-NEXT: 1 3
+print(result[0], result[1], result[2]);
+// CHECK-NEXT: 226 134 145
+
+// Non-ASCII case that does NOT fit: the second 3-byte character is dropped
+stats = encoder.encodeInto('\u{2191}\u{2192}', result);
+print(stats.read, stats.written);
+// CHECK-NEXT: 1 3
+print(result[0], result[1], result[2]);
+// CHECK-NEXT: 226 134 145
+
+// Surrogate pair case (4 UTF-8 bytes) that just fits within the buffer
+stats = encoder.encodeInto('\u{D83D}\u{DE03}', result);
+print(stats.read, stats.written);
+// CHECK-NEXT: 2 4
+print(result[0], result[1], result[2], result[3]);
+// CHECK-NEXT: 240 159 152 131