Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

deps: update icu to 75.1 #52573

Merged
merged 1 commit into from
Apr 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
4 changes: 3 additions & 1 deletion deps/icu-small/LICENSE
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ UNICODE LICENSE V3

COPYRIGHT AND PERMISSION NOTICE

Copyright © 2016-2023 Unicode, Inc.
Copyright © 2016-2024 Unicode, Inc.

NOTICE TO USER: Carefully read the following legal agreement. BY
DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING DATA FILES, AND/OR
Expand Down Expand Up @@ -38,6 +38,8 @@ not be used in advertising or otherwise to promote the sale, use or other
dealings in these Data Files or Software without prior written
authorization of the copyright holder.

SPDX-License-Identifier: Unicode-3.0

----------------------------------------------------------------------

Third-Party Software Licenses
Expand Down
4 changes: 2 additions & 2 deletions deps/icu-small/README-FULL-ICU.txt
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
ICU sources - auto generated by shrink-icu-src.py

This directory contains the ICU subset used by --with-intl=full-icu
It is a strict subset of ICU 74 source files with the following exception(s):
* deps/icu-small/source/data/in/icudt74l.dat.bz2 : compressed data file
It is a strict subset of ICU 75 source files with the following exception(s):
* deps/icu-small/source/data/in/icudt75l.dat.bz2 : compressed data file


To rebuild this directory, see ../../tools/icu/README.md
Expand Down
6 changes: 2 additions & 4 deletions deps/icu-small/source/common/brkeng.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -114,13 +114,11 @@ UnhandledEngine::handleCharacter(UChar32 c) {
*/

ICULanguageBreakFactory::ICULanguageBreakFactory(UErrorCode &/*status*/) {
fEngines = 0;
fEngines = nullptr;
}

ICULanguageBreakFactory::~ICULanguageBreakFactory() {
if (fEngines != 0) {
delete fEngines;
}
delete fEngines;
}

void ICULanguageBreakFactory::ensureEngines(UErrorCode& status) {
Expand Down
9 changes: 3 additions & 6 deletions deps/icu-small/source/common/brkiter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -438,17 +438,14 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status)
UTRACE_ENTRY(UTRACE_UBRK_CREATE_LINE);
uprv_strcpy(lb_lw, "line");
UErrorCode kvStatus = U_ZERO_ERROR;
CharString value;
CharStringByteSink valueSink(&value);
loc.getKeywordValue("lb", valueSink, kvStatus);
auto value = loc.getKeywordValue<CharString>("lb", kvStatus);
if (U_SUCCESS(kvStatus) && (value == "strict" || value == "normal" || value == "loose")) {
uprv_strcat(lb_lw, "_");
uprv_strcat(lb_lw, value.data());
}
// lw=phrase is only supported in Japanese and Korean
if (uprv_strcmp(loc.getLanguage(), "ja") == 0 || uprv_strcmp(loc.getLanguage(), "ko") == 0) {
value.clear();
loc.getKeywordValue("lw", valueSink, kvStatus);
value = loc.getKeywordValue<CharString>("lw", kvStatus);
if (U_SUCCESS(kvStatus) && value == "phrase") {
uprv_strcat(lb_lw, "_");
uprv_strcat(lb_lw, value.data());
Expand Down Expand Up @@ -500,7 +497,7 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status)
Locale
BreakIterator::getLocale(ULocDataLocaleType type, UErrorCode& status) const {
if (type == ULOC_REQUESTED_LOCALE) {
return Locale(requestLocale);
return {requestLocale};
}
U_LOCALE_BASED(locBased, *this);
return locBased.getLocale(type, status);
Expand Down
112 changes: 90 additions & 22 deletions deps/icu-small/source/common/bytesinkutil.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,18 +7,52 @@
#ifndef BYTESINKUTIL_H
#define BYTESINKUTIL_H

#include <type_traits>

#include "unicode/utypes.h"
#include "unicode/bytestream.h"
#include "unicode/edits.h"
#include "charstr.h"
#include "cmemory.h"
#include "uassert.h"
#include "ustr_imp.h"

U_NAMESPACE_BEGIN

class ByteSink;
class CharString;
class Edits;

class U_COMMON_API CharStringByteSink : public ByteSink {
public:
CharStringByteSink(CharString* dest);
~CharStringByteSink() override;

CharStringByteSink() = delete;
CharStringByteSink(const CharStringByteSink&) = delete;
CharStringByteSink& operator=(const CharStringByteSink&) = delete;

void Append(const char* bytes, int32_t n) override;

char* GetAppendBuffer(int32_t min_capacity,
int32_t desired_capacity_hint,
char* scratch,
int32_t scratch_capacity,
int32_t* result_capacity) override;

private:
CharString& dest_;
};

// CharString doesn't provide the public API that StringByteSink requires a
// string class to have so this template specialization replaces the default
// implementation of StringByteSink<CharString> with CharStringByteSink.
template<>
class StringByteSink<CharString> : public CharStringByteSink {
public:
StringByteSink(CharString* dest) : CharStringByteSink(dest) { }
StringByteSink(CharString* dest, int32_t /*initialAppendCapacity*/) : CharStringByteSink(dest) { }
};

class U_COMMON_API ByteSinkUtil {
public:
ByteSinkUtil() = delete; // all static
Expand Down Expand Up @@ -57,30 +91,64 @@ class U_COMMON_API ByteSinkUtil {
ByteSink &sink, uint32_t options, Edits *edits,
UErrorCode &errorCode);

private:
static void appendNonEmptyUnchanged(const uint8_t *s, int32_t length,
ByteSink &sink, uint32_t options, Edits *edits);
};

class U_COMMON_API CharStringByteSink : public ByteSink {
public:
CharStringByteSink(CharString* dest);
~CharStringByteSink() override;

CharStringByteSink() = delete;
CharStringByteSink(const CharStringByteSink&) = delete;
CharStringByteSink& operator=(const CharStringByteSink&) = delete;

void Append(const char* bytes, int32_t n) override;
/**
* Calls a lambda that writes to a ByteSink with a CheckedArrayByteSink
* and then returns through u_terminateChars(), in order to implement
* the classic ICU4C C API writing to a fix sized buffer on top of a
* contemporary C++ API.
*
* @param buffer receiving buffer
* @param capacity capacity of receiving buffer
* @param lambda that gets called with the sink as an argument
* @param status set to U_BUFFER_OVERFLOW_ERROR on overflow
* @return number of bytes written, or needed (in case of overflow)
* @internal
*/
template <typename F,
typename = std::enable_if_t<
std::is_invocable_r_v<void, F, ByteSink&, UErrorCode&>>>
static int32_t viaByteSinkToTerminatedChars(char* buffer, int32_t capacity,
F&& lambda,
UErrorCode& status) {
if (U_FAILURE(status)) { return 0; }
CheckedArrayByteSink sink(buffer, capacity);
lambda(sink, status);
if (U_FAILURE(status)) { return 0; }

int32_t reslen = sink.NumberOfBytesAppended();

if (sink.Overflowed()) {
status = U_BUFFER_OVERFLOW_ERROR;
return reslen;
}

return u_terminateChars(buffer, capacity, reslen, &status);
}

char* GetAppendBuffer(int32_t min_capacity,
int32_t desired_capacity_hint,
char* scratch,
int32_t scratch_capacity,
int32_t* result_capacity) override;
/**
* Calls a lambda that writes to a ByteSink with a CharStringByteSink and
* then returns a CharString, in order to implement a contemporary C++ API
* on top of a C/C++ compatibility ByteSink API.
*
* @param lambda that gets called with the sink as an argument
* @param status to check and report
* @return the resulting string, or an empty string (in case of error)
* @internal
*/
template <typename F,
typename = std::enable_if_t<
std::is_invocable_r_v<void, F, ByteSink&, UErrorCode&>>>
static CharString viaByteSinkToCharString(F&& lambda, UErrorCode& status) {
if (U_FAILURE(status)) { return {}; }
CharString result;
CharStringByteSink sink(&result);
lambda(sink, status);
return result;
}

private:
CharString& dest_;
static void appendNonEmptyUnchanged(const uint8_t *s, int32_t length,
ByteSink &sink, uint32_t options, Edits *edits);
};

U_NAMESPACE_END
Expand Down
50 changes: 29 additions & 21 deletions deps/icu-small/source/common/caniter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ U_NAMESPACE_BEGIN

UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CanonicalIterator)


/**
*@param source string to get results for
*/
Expand All @@ -73,10 +74,10 @@ CanonicalIterator::CanonicalIterator(const UnicodeString &sourceStr, UErrorCode
pieces_lengths(nullptr),
current(nullptr),
current_length(0),
nfd(*Normalizer2::getNFDInstance(status)),
nfcImpl(*Normalizer2Factory::getNFCImpl(status))
nfd(Normalizer2::getNFDInstance(status)),
nfcImpl(Normalizer2Factory::getNFCImpl(status))
{
if(U_SUCCESS(status) && nfcImpl.ensureCanonIterData(status)) {
if(U_SUCCESS(status) && nfcImpl->ensureCanonIterData(status)) {
setSource(sourceStr, status);
}
}
Expand Down Expand Up @@ -172,7 +173,7 @@ void CanonicalIterator::setSource(const UnicodeString &newSource, UErrorCode &st
int32_t i = 0;
UnicodeString *list = nullptr;

nfd.normalize(newSource, source, status);
nfd->normalize(newSource, source, status);
if(U_FAILURE(status)) {
return;
}
Expand All @@ -194,7 +195,7 @@ void CanonicalIterator::setSource(const UnicodeString &newSource, UErrorCode &st
current[0] = 0;
pieces[0] = new UnicodeString[1];
pieces_lengths[0] = 1;
if (pieces[0] == 0) {
if (pieces[0] == nullptr) {
status = U_MEMORY_ALLOCATION_ERROR;
goto CleanPartialInitialization;
}
Expand All @@ -203,7 +204,7 @@ void CanonicalIterator::setSource(const UnicodeString &newSource, UErrorCode &st


list = new UnicodeString[source.length()];
if (list == 0) {
if (list == nullptr) {
status = U_MEMORY_ALLOCATION_ERROR;
goto CleanPartialInitialization;
}
Expand All @@ -219,7 +220,7 @@ void CanonicalIterator::setSource(const UnicodeString &newSource, UErrorCode &st
// on the NFD form - see above).
for (; i < source.length(); i += U16_LENGTH(cp)) {
cp = source.char32At(i);
if (nfcImpl.isCanonSegmentStarter(cp)) {
if (nfcImpl->isCanonSegmentStarter(cp)) {
source.extract(start, i-start, list[list_length++]); // add up to i
start = i;
}
Expand Down Expand Up @@ -252,9 +253,7 @@ void CanonicalIterator::setSource(const UnicodeString &newSource, UErrorCode &st
return;
// Common section to cleanup all local variables and reset object variables.
CleanPartialInitialization:
if (list != nullptr) {
delete[] list;
}
delete[] list;
cleanPieces();
}

Expand All @@ -264,10 +263,19 @@ void CanonicalIterator::setSource(const UnicodeString &newSource, UErrorCode &st
* @param source the string to find permutations for
* @return the results in a set.
*/
void U_EXPORT2 CanonicalIterator::permute(UnicodeString &source, UBool skipZeros, Hashtable *result, UErrorCode &status) {
void U_EXPORT2 CanonicalIterator::permute(UnicodeString &source, UBool skipZeros, Hashtable *result, UErrorCode &status, int32_t depth) {
if(U_FAILURE(status)) {
return;
}
// To avoid infinity loop caused by permute, we limit the depth of recursive
// call to permute and return U_UNSUPPORTED_ERROR.
// We know in some unit test we need at least 4. Set to 8 just in case some
// unforseen use cases.
constexpr int32_t kPermuteDepthLimit = 8;
if (depth > kPermuteDepthLimit) {
status = U_UNSUPPORTED_ERROR;
return;
}
//if (PROGRESS) printf("Permute: %s\n", UToS(Tr(source)));
int32_t i = 0;

Expand All @@ -277,7 +285,7 @@ void U_EXPORT2 CanonicalIterator::permute(UnicodeString &source, UBool skipZeros
if (source.length() <= 2 && source.countChar32() <= 1) {
UnicodeString *toPut = new UnicodeString(source);
/* test for nullptr */
if (toPut == 0) {
if (toPut == nullptr) {
status = U_MEMORY_ALLOCATION_ERROR;
return;
}
Expand Down Expand Up @@ -311,7 +319,7 @@ void U_EXPORT2 CanonicalIterator::permute(UnicodeString &source, UBool skipZeros

// see what the permutations of the characters before and after this one are
//Hashtable *subpermute = permute(source.substring(0,i) + source.substring(i + UTF16.getCharCount(cp)));
permute(subPermuteString.remove(i, U16_LENGTH(cp)), skipZeros, &subpermute, status);
permute(subPermuteString.remove(i, U16_LENGTH(cp)), skipZeros, &subpermute, status, depth+1);
/* Test for buffer overflows */
if(U_FAILURE(status)) {
return;
Expand Down Expand Up @@ -346,7 +354,7 @@ UnicodeString* CanonicalIterator::getEquivalents(const UnicodeString &segment, i
Hashtable permutations(status);
Hashtable basic(status);
if (U_FAILURE(status)) {
return 0;
return nullptr;
}
result.setValueDeleter(uprv_deleteUObject);
permutations.setValueDeleter(uprv_deleteUObject);
Expand Down Expand Up @@ -381,7 +389,7 @@ UnicodeString* CanonicalIterator::getEquivalents(const UnicodeString &segment, i
//UnicodeString *possible = new UnicodeString(*((UnicodeString *)(ne2->value.pointer)));
UnicodeString possible(*((UnicodeString *)(ne2->value.pointer)));
UnicodeString attempt;
nfd.normalize(possible, attempt, status);
nfd->normalize(possible, attempt, status);

// TODO: check if operator == is semanticaly the same as attempt.equals(segment)
if (attempt==segment) {
Expand All @@ -399,15 +407,15 @@ UnicodeString* CanonicalIterator::getEquivalents(const UnicodeString &segment, i

/* Test for buffer overflows */
if(U_FAILURE(status)) {
return 0;
return nullptr;
}
// convert into a String[] to clean up storage
//String[] finalResult = new String[result.size()];
UnicodeString *finalResult = nullptr;
int32_t resultCount;
if((resultCount = result.count()) != 0) {
finalResult = new UnicodeString[resultCount];
if (finalResult == 0) {
if (finalResult == nullptr) {
status = U_MEMORY_ALLOCATION_ERROR;
return nullptr;
}
Expand Down Expand Up @@ -448,7 +456,7 @@ Hashtable *CanonicalIterator::getEquivalents2(Hashtable *fillinResult, const cha
for (int32_t i = 0; i < segLen; i += U16_LENGTH(cp)) {
// see if any character is at the start of some decomposition
U16_GET(segment, 0, i, segLen, cp);
if (!nfcImpl.getCanonStartSet(cp, starts)) {
if (!nfcImpl->getCanonStartSet(cp, starts)) {
continue;
}
// if so, see which decompositions match
Expand All @@ -471,7 +479,7 @@ Hashtable *CanonicalIterator::getEquivalents2(Hashtable *fillinResult, const cha
UnicodeString item = *((UnicodeString *)(ne->value.pointer));
UnicodeString *toAdd = new UnicodeString(prefix);
/* test for nullptr */
if (toAdd == 0) {
if (toAdd == nullptr) {
status = U_MEMORY_ALLOCATION_ERROR;
return nullptr;
}
Expand Down Expand Up @@ -509,7 +517,7 @@ Hashtable *CanonicalIterator::extract(Hashtable *fillinResult, UChar32 comp, con
UnicodeString temp(comp);
int32_t inputLen=temp.length();
UnicodeString decompString;
nfd.normalize(temp, decompString, status);
nfd->normalize(temp, decompString, status);
if (U_FAILURE(status)) {
return nullptr;
}
Expand Down Expand Up @@ -573,7 +581,7 @@ Hashtable *CanonicalIterator::extract(Hashtable *fillinResult, UChar32 comp, con
// brute force approach
// check to make sure result is canonically equivalent
UnicodeString trial;
nfd.normalize(temp, trial, status);
nfd->normalize(temp, trial, status);
if(U_FAILURE(status) || trial.compare(segment+segmentPos, segLen - segmentPos) != 0) {
return nullptr;
}
Expand Down
Loading
Loading