From 17f4995b98cc486e94fb7f9d50fe6939a7d6368b Mon Sep 17 00:00:00 2001 From: Tim Chevalier Date: Fri, 25 Oct 2024 15:16:51 -0700 Subject: [PATCH] ICU-22953 MF2: Allow unpaired surrogates in text and quoted literals See https://github.com/unicode-org/message-format-wg/pull/906/ --- icu4c/source/i18n/messageformat2_parser.cpp | 4 ++-- .../test/intltest/messageformat2test.cpp | 24 +++++++++++++++++-- .../source/test/intltest/messageformat2test.h | 1 + 3 files changed, 25 insertions(+), 4 deletions(-) diff --git a/icu4c/source/i18n/messageformat2_parser.cpp b/icu4c/source/i18n/messageformat2_parser.cpp index b4768756c5ea..22cb52266eaa 100644 --- a/icu4c/source/i18n/messageformat2_parser.cpp +++ b/icu4c/source/i18n/messageformat2_parser.cpp @@ -121,8 +121,8 @@ static bool isContentChar(UChar32 c) { || inRange(c, 0x002F, 0x003F) // Omit '@' || inRange(c, 0x0041, 0x005B) // Omit '\' || inRange(c, 0x005D, 0x007A) // Omit { | } - || inRange(c, 0x007E, 0xD7FF) // Omit surrogates - || inRange(c, 0xE000, 0x10FFFF); + || inRange(c, 0x007E, 0x2FFF) // Omit IDEOGRAPHIC_SPACE + || inRange(c, 0x3001, 0x10FFFF); // Allowing surrogates is intentional } // See `s` in the MessageFormat 2 grammar diff --git a/icu4c/source/test/intltest/messageformat2test.cpp b/icu4c/source/test/intltest/messageformat2test.cpp index 353082ef5c91..68a10191318a 100644 --- a/icu4c/source/test/intltest/messageformat2test.cpp +++ b/icu4c/source/test/intltest/messageformat2test.cpp @@ -33,6 +33,7 @@ TestMessageFormat2::runIndexedTest(int32_t index, UBool exec, TESTCASE_AUTO(testFormatterAPI); TESTCASE_AUTO(testHighLoneSurrogate); TESTCASE_AUTO(testLowLoneSurrogate); + TESTCASE_AUTO(testLoneSurrogateInQuotedLiteral); TESTCASE_AUTO(dataDrivenTests); TESTCASE_AUTO_END; } @@ -350,7 +351,8 @@ void TestMessageFormat2::testHighLoneSurrogate() { .setPattern(loneSurrogate, pe, errorCode) .build(errorCode); UnicodeString result = msgfmt1.formatToString({}, errorCode); - errorCode.expectErrorAndReset(U_MF_SYNTAX_ERROR, "testHighLoneSurrogate"); + assertEquals("testHighLoneSurrogate", loneSurrogate, result); + errorCode.errIfFailureAndReset("testHighLoneSurrogate"); } // ICU-22890 lone surrogate cause infinity loop @@ -364,7 +366,25 @@ void TestMessageFormat2::testLowLoneSurrogate() { .setPattern(loneSurrogate, pe, errorCode) .build(errorCode); UnicodeString result = msgfmt2.formatToString({}, errorCode); - errorCode.expectErrorAndReset(U_MF_SYNTAX_ERROR, "testLowLoneSurrogate"); + assertEquals("testLowLoneSurrogate", loneSurrogate, result); + errorCode.errIfFailureAndReset("testLowLoneSurrogate"); +} + +void TestMessageFormat2::testLoneSurrogateInQuotedLiteral() { + IcuTestErrorCode errorCode(*this, "testLoneSurrogateInQuotedLiteral"); + UParseError pe = { 0, 0, {0}, {0} }; + // |\udc02| + UnicodeString literal("{|"); + literal += 0xdc02; + literal += "|}"; + UnicodeString expectedResult({0xdc02, 0}); + icu::message2::MessageFormatter msgfmt2 = + icu::message2::MessageFormatter::Builder(errorCode) + .setPattern(literal, pe, errorCode) + .build(errorCode); + UnicodeString result = msgfmt2.formatToString({}, errorCode); + assertEquals("testLoneSurrogateInQuotedLiteral", expectedResult, result); + errorCode.errIfFailureAndReset("testLoneSurrogateInQuotedLiteral"); } void TestMessageFormat2::dataDrivenTests() { diff --git a/icu4c/source/test/intltest/messageformat2test.h b/icu4c/source/test/intltest/messageformat2test.h index 71dfb3916c9b..6ac2f1584e19 100644 --- a/icu4c/source/test/intltest/messageformat2test.h +++ b/icu4c/source/test/intltest/messageformat2test.h @@ -91,6 +91,7 @@ class TestMessageFormat2: public IntlTest { void testHighLoneSurrogate(void); void testLowLoneSurrogate(void); + void testLoneSurrogateInQuotedLiteral(void); }; // class TestMessageFormat2 U_NAMESPACE_BEGIN