Skip to content

Commit

Permalink
Add TextEncoder.prototype.encodeInto()
Browse files Browse the repository at this point in the history
Summary: Implement TextEncoder's `encodeInto()` function.

Reviewed By: avp

Differential Revision: D53216139

fbshipit-source-id: eb4f5a1461084d22c77a7c0723de624f50468785
  • Loading branch information
dannysu authored and facebook-github-bot committed Feb 17, 2024
1 parent 7f9d9d5 commit 3863a36
Show file tree
Hide file tree
Showing 6 changed files with 257 additions and 29 deletions.
10 changes: 10 additions & 0 deletions include/hermes/Support/UTF8.h
Original file line number Diff line number Diff line change
Expand Up @@ -245,6 +245,16 @@ bool convertUTF16ToUTF8WithReplacements(
llvh::ArrayRef<char16_t> input,
size_t maxCharacters = 0);

/// Convert a UTF-16 encoded string \p input into the pre-allocated UTF-8
/// buffer \p outBuffer, replacing unpaired surrogate halves with the Unicode
/// replacement character (U+FFFD). The capacity of the destination is taken
/// from \p outBuffer itself; conversion stops at the first code point whose
/// UTF-8 encoding no longer fits, so a code point is never partially written.
/// \param outBuffer destination bytes; only a prefix of it is overwritten.
/// \param input the UTF-16 code units to convert.
/// \return a std::pair with the first element being the number of UTF-16
/// code units of \p input that were converted, and the second element being
/// the number of UTF-8 bytes written to \p outBuffer.
std::pair<uint32_t, uint32_t> convertUTF16ToUTF8BufferWithReplacements(
llvh::MutableArrayRef<uint8_t> outBuffer,
llvh::ArrayRef<char16_t> input);

/// Convert a UTF-8 encoded string (with surrogates) \p input to a UTF-8 one
/// (without surrogates), storing the conversion in \p output. Output characters
/// are appended to \p output.
Expand Down
1 change: 1 addition & 0 deletions include/hermes/VM/NativeFunctions.def
Original file line number Diff line number Diff line change
Expand Up @@ -364,6 +364,7 @@ NATIVE_FUNCTION(symbolPrototypeValueOf)
NATIVE_FUNCTION(textEncoderConstructor)
NATIVE_FUNCTION(textEncoderPrototypeEncoding)
NATIVE_FUNCTION(textEncoderPrototypeEncode)
NATIVE_FUNCTION(textEncoderPrototypeEncodeInto)
NATIVE_FUNCTION(throwTypeError)
NATIVE_FUNCTION(typedArrayBaseConstructor)
NATIVE_FUNCTION(typedArrayFrom)
Expand Down
3 changes: 3 additions & 0 deletions include/hermes/VM/PredefinedStrings.def
Original file line number Diff line number Diff line change
Expand Up @@ -486,6 +486,9 @@ STR(squareSymbolSplit, "[Symbol.split]")

STR(TextEncoder, "TextEncoder")
STR(encode, "encode")
STR(encodeInto, "encodeInto")
STR(read, "read")
STR(written, "written")
STR(encoding, "encoding")
STR(utf8, "utf-8")

Expand Down
113 changes: 84 additions & 29 deletions lib/Support/UTF8.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,39 @@ void encodeUTF8(char *&dst, uint32_t cp) {
dst = d;
}

/// Decode the single Unicode scalar value that starts at \p cur.
///
/// This combines ES14 11.1.4 CodePointAt() with the rule from
/// https://infra.spec.whatwg.org/#strings for lone surrogates: "To convert a
/// string into a scalar value string, replace any surrogates with U+FFFD."
/// A surrogate that is not part of a well-formed high/low pair is therefore
/// reported as UNICODE_REPLACEMENT_CHARACTER (U+FFFD), so the value handed
/// back to the caller is never itself a surrogate.
///
/// \param cur Iterator pointing to the current UTF-16 code unit. It is
///   dereferenced unconditionally, so it must not be equal to \p end.
/// \param end Iterator pointing one past the last code unit of the string.
/// \return std::pair whose first element is the decoded code point and whose
///   second element is the number of UTF-16 code units consumed (1 or 2).
static std::pair<char32_t, size_t> convertToCodePointAt(
    llvh::ArrayRef<char16_t>::iterator cur,
    llvh::ArrayRef<char16_t>::iterator end) {
  const char16_t first = *cur;

  // A low surrogate with no preceding high surrogate is unpaired.
  if (isLowSurrogate(first)) {
    return {UNICODE_REPLACEMENT_CHARACTER, 1};
  }

  // Anything that is not a surrogate is already a scalar value.
  if (!isHighSurrogate(first)) {
    return {first, 1};
  }

  // Leading high surrogate: it is only valid when the very next code unit
  // exists and is a low surrogate.
  const auto next = cur + 1;
  if (next != end && isLowSurrogate(*next)) {
    // Well-formed pair; both code units are consumed.
    return {utf16SurrogatePairToCodePoint(first, *next), 2};
  }

  // Trailing or otherwise unpaired high surrogate.
  return {UNICODE_REPLACEMENT_CHARACTER, 1};
}

bool convertUTF16ToUTF8WithReplacements(
std::string &out,
llvh::ArrayRef<char16_t> input,
Expand All @@ -85,40 +118,62 @@ bool convertUTF16ToUTF8WithReplacements(
continue;
}

// The following logic is a combination of ES14 11.1.4 CodePointAt() and
// what https://infra.spec.whatwg.org/#strings says about what to do with
// singular surrogates: "To convert a string into a scalar value string,
// replace any surrogates with U+FFFD." Therefore, if we encounter any lone
// surrogate, replace the value with UNICODE_REPLACEMENT_CHARACTER (U+FFFD).
// The result of this process is that the enclosing for-loop processes only
// scalar values (aka a code point that is not a surrogate).
char32_t c32;
if (isLowSurrogate(cur[0])) {
// Unpaired low surrogate.
c32 = UNICODE_REPLACEMENT_CHARACTER;
} else if (isHighSurrogate(cur[0])) {
// Leading high surrogate. See if the next character is a low surrogate.
if (cur + 1 == end || !isLowSurrogate(cur[1])) {
// Trailing or unpaired high surrogate.
c32 = UNICODE_REPLACEMENT_CHARACTER;
} else {
// Decode surrogate pair and increment, because we consumed two chars.
c32 = utf16SurrogatePairToCodePoint(cur[0], cur[1]);
++cur;
auto [c32, inputConsumed] = convertToCodePointAt(cur, end);
cur += (inputConsumed - 1);

// The code point to be encoded here is guaranteed to be a valid unicode
// code point and not a surrogate. Because of the convertToCodePointAt()
// process.
std::array<char, UTF8CodepointMaxBytes> buff;
char *ptr = buff.data();
encodeUTF8(ptr, c32);
out.insert(out.end(), buff.data(), ptr);
}
return cur == end;
}

/// Encode the UTF-16 string \p input as UTF-8 directly into \p outBuffer,
/// replacing every unpaired surrogate with U+FFFD.
///
/// Encoding stops as soon as the next code point would not fit in the
/// remaining space, so \p outBuffer never receives a partially written
/// code point and is never written past its end.
///
/// \param outBuffer caller-owned destination; its size is the capacity.
/// \param input UTF-16 code units to encode.
/// \return {number of UTF-16 code units consumed from \p input,
///          number of UTF-8 bytes stored in \p outBuffer}.
std::pair<uint32_t, uint32_t> convertUTF16ToUTF8BufferWithReplacements(
    llvh::MutableArrayRef<uint8_t> outBuffer,
    llvh::ArrayRef<char16_t> input) {
  uint32_t numRead = 0;
  uint32_t numWritten = 0;
  uint8_t *writtenPtr = outBuffer.begin();
  const auto end = input.end();
  auto cur = input.begin();

  while (cur < end) {
    const char16_t c = *cur;

    // ASCII fast-path: one code unit in, one identical byte out.
    if (LLVM_LIKELY(c <= 0x7F)) {
      if (numWritten + 1 > outBuffer.size()) {
        break;
      }
      *writtenPtr = static_cast<uint8_t>(c);
      ++writtenPtr;
      ++numWritten;
      ++numRead;
      ++cur;
      continue;
    }

    // Lone surrogates come back as U+FFFD, so the value encoded below is
    // always a scalar value; inputConsumed is 1, or 2 for a surrogate pair.
    auto [c32, inputConsumed] = convertToCodePointAt(cur, end);

    // Encode into scratch space first: the encoded length is only known
    // afterwards, and the destination must not receive a truncated sequence.
    std::array<char, UTF8CodepointMaxBytes> buff;
    char *ptr = buff.data();
    encodeUTF8(ptr, c32);

    const size_t convertedLength = ptr - buff.data();
    if (numWritten + convertedLength > outBuffer.size()) {
      break;
    }
    std::memcpy(writtenPtr, buff.data(), convertedLength);
    writtenPtr += convertedLength;
    numWritten += static_cast<uint32_t>(convertedLength);
    numRead += static_cast<uint32_t>(inputConsumed);
    // Advance only after the code point has actually been stored.
    cur += inputConsumed;
  }

  return {numRead, numWritten};
}

void convertUTF16ToUTF8WithSingleSurrogates(
Expand Down
103 changes: 103 additions & 0 deletions lib/VM/JSLib/TextEncoder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,14 @@ Handle<JSObject> createTextEncoderConstructor(Runtime &runtime) {
textEncoderPrototypeEncode,
1);

defineMethod(
runtime,
textEncoderPrototype,
Predefined::getSymbolID(Predefined::encodeInto),
nullptr,
textEncoderPrototypeEncodeInto,
2);

auto cons = defineSystemConstructor<JSObject>(
runtime,
Predefined::getSymbolID(Predefined::TextEncoder),
Expand Down Expand Up @@ -182,5 +190,100 @@ textEncoderPrototypeEncode(void *, Runtime &runtime, NativeArgs args) {
}
}

/// Native implementation of TextEncoder.prototype.encodeInto(source, dest).
///
/// Converts argument 0 to a string, encodes it as UTF-8 into the Uint8Array
/// given as argument 1 (never writing more bytes than the array's length),
/// and returns a new object with two number properties:
///   - `read`: how many UTF-16 code units of the source were consumed, and
///   - `written`: how many bytes were stored in the destination.
///
/// Raises a TypeError when `this` is not a TextEncoder, when argument 1 is
/// not a Uint8Array, or when that Uint8Array is detached. Propagates any
/// exception thrown while converting argument 0 to a string.
CallResult<HermesValue>
textEncoderPrototypeEncodeInto(void *, Runtime &runtime, NativeArgs args) {
  GCScope gcScope{runtime};
  auto selfHandle = args.dyncastThis<JSObject>();
  // The dyncast helpers hand back a null handle on a type mismatch (the same
  // is checked for dyncastArg further down). Reject a non-object `this` here
  // instead of passing a null handle to getOwnNamedDescriptor().
  if (LLVM_UNLIKELY(!selfHandle)) {
    return runtime.raiseTypeError(
        "TextEncoder.prototype.encodeInto() called on non-TextEncoder object");
  }

  // A TextEncoder instance is recognised by its internal marker property.
  NamedPropertyDescriptor desc;
  bool exists = JSObject::getOwnNamedDescriptor(
      selfHandle,
      runtime,
      Predefined::getSymbolID(Predefined::InternalPropertyTextEncoderType),
      desc);
  if (LLVM_UNLIKELY(!exists)) {
    return runtime.raiseTypeError(
        "TextEncoder.prototype.encodeInto() called on non-TextEncoder object");
  }

  auto strRes = toString_RJS(runtime, args.getArgHandle(0));
  if (LLVM_UNLIKELY(strRes == ExecutionStatus::EXCEPTION)) {
    return ExecutionStatus::EXCEPTION;
  }
  Handle<StringPrimitive> string = runtime.makeHandle(std::move(*strRes));

  Handle<Uint8Array> typedArray = args.dyncastArg<Uint8Array>(1);
  if (LLVM_UNLIKELY(!typedArray)) {
    return runtime.raiseTypeError("The second argument should be a Uint8Array");
  }

  // The attachment check runs after the string conversion above.
  if (LLVM_UNLIKELY(!typedArray->attached(runtime))) {
    return runtime.raiseTypeError(
        "TextEncoder.prototype.encodeInto() called on a detached Uint8Array");
  }

  // Create the result object up front, before raw pointers into the string
  // and the typed array are taken below.
  PseudoHandle<JSObject> objRes = JSObject::create(runtime, 2);
  Handle<JSObject> obj = runtime.makeHandle(objRes.get());

  uint32_t numRead = 0;
  uint32_t numWritten = 0;

  if (LLVM_UNLIKELY(string->getStringLength() == 0)) {
    // Nothing to encode; both counters stay at zero.
    numRead = 0;
    numWritten = 0;
  } else if (string->isASCII()) {
    // ASCII string can trivially be converted to UTF-8 because ASCII is a
    // strict subset. However, since the output array size is provided by the
    // caller, we will only copy as much length as provided.
    llvh::ArrayRef<char> strRef = string->getStringRef<char>();

    uint32_t copiedLength =
        std::min(string->getStringLength(), typedArray->getLength());

    std::memcpy(typedArray->begin(runtime), strRef.data(), copiedLength);

    // One input character maps to exactly one output byte here.
    numRead = copiedLength;
    numWritten = copiedLength;
  } else {
    // Convert UTF-16 to the given Uint8Array
    llvh::ArrayRef<char16_t> strRef = string->getStringRef<char16_t>();
    std::pair<uint32_t, uint32_t> result =
        convertUTF16ToUTF8BufferWithReplacements(
            llvh::makeMutableArrayRef<uint8_t>(
                typedArray->begin(runtime), typedArray->getLength()),
            strRef);
    numRead = result.first;
    numWritten = result.second;
  }

  // Construct the result JSObject containing information about how much data
  // was converted
  auto numReadHandle =
      runtime.makeHandle(HermesValue::encodeTrustedNumberValue(numRead));
  auto numWrittenHandle =
      runtime.makeHandle(HermesValue::encodeTrustedNumberValue(numWritten));

  auto res = JSObject::defineNewOwnProperty(
      obj,
      runtime,
      Predefined::getSymbolID(Predefined::read),
      PropertyFlags::defaultNewNamedPropertyFlags(),
      numReadHandle);
  if (LLVM_UNLIKELY(res == ExecutionStatus::EXCEPTION)) {
    return ExecutionStatus::EXCEPTION;
  }

  res = JSObject::defineNewOwnProperty(
      obj,
      runtime,
      Predefined::getSymbolID(Predefined::written),
      PropertyFlags::defaultNewNamedPropertyFlags(),
      numWrittenHandle);
  if (LLVM_UNLIKELY(res == ExecutionStatus::EXCEPTION)) {
    return ExecutionStatus::EXCEPTION;
  }

  return obj.getHermesValue();
}

} // namespace vm
} // namespace hermes
56 changes: 56 additions & 0 deletions test/hermes/text-encoder.js
Original file line number Diff line number Diff line change
Expand Up @@ -64,3 +64,59 @@ print(result.length, result.join(' '));
result = encoder.encode('\u{D83D}');
print(result.length, result.join(' '));
// CHECK-NEXT: 3 239 191 189

// Every encodeInto() case below writes into this same 4-byte destination.
result = new Uint8Array(4);

// encodeInto() must throw when its receiver is a plain object rather than
// a TextEncoder.
try {
const b = {};
TextEncoder.prototype.encodeInto.call(b, '', result);
} catch (e) {
print(e.message);
// CHECK-NEXT: TextEncoder.prototype.encodeInto() called on non-TextEncoder object
}

// Test the ASCII case that just fits within the provided buffer
let stats = encoder.encodeInto('test', result);
print(stats.read, stats.written);
// CHECK-NEXT: 4 4
print(result[0], result[1], result[2], result[3]);
// CHECK-NEXT: 116 101 115 116

// Empty input: nothing is read and nothing is written.
stats = encoder.encodeInto('', result);
print(stats.read, stats.written);
// CHECK-NEXT: 0 0

// ASCII case that does NOT fit within the provided buffer
// (only the first 4 of the 7 characters are encoded).
stats = encoder.encodeInto('testing', result);
print(stats.read, stats.written);
// CHECK-NEXT: 4 4
print(result[0], result[1], result[2], result[3]);
// CHECK-NEXT: 116 101 115 116

// ASCII case that is smaller than the provided buffer
stats = encoder.encodeInto('abc', result);
print(stats.read, stats.written);
// CHECK-NEXT: 3 3
print(result[0], result[1], result[2]);
// CHECK-NEXT: 97 98 99

// UTF-16 case that fits within the provided buffer
// (one UTF-16 code unit becomes three UTF-8 bytes).
stats = encoder.encodeInto('\u{2191}', result);
print(stats.read, stats.written);
// CHECK-NEXT: 1 3
print(result[0], result[1], result[2]);
// CHECK-NEXT: 226 134 145

// UTF-16 case that does NOT fit within the provided buffer
// (the second character also needs three bytes but only one is left, so it
// is skipped entirely rather than written partially).
stats = encoder.encodeInto('\u{2191}\u{2192}', result);
print(stats.read, stats.written);
// CHECK-NEXT: 1 3
print(result[0], result[1], result[2]);
// CHECK-NEXT: 226 134 145

// Surrogate case that just fits within the provided buffer
// (a surrogate pair counts as two code units read and four bytes written).
stats = encoder.encodeInto('\u{D83D}\u{DE03}', result);
print(stats.read, stats.written);
// CHECK-NEXT: 2 4
print(result[0], result[1], result[2], result[3]);
// CHECK-NEXT: 240 159 152 131

0 comments on commit 3863a36

Please sign in to comment.