From 911d047f3cebdf0fa76293deacae079a5f1ca57e Mon Sep 17 00:00:00 2001
From: Tim Chevalier <tjc@igalia.com>
Date: Tue, 8 Oct 2024 14:41:59 -0700
Subject: [PATCH] ICU-22940 MF2 ICU4C: Update for bidi support

Per https://github.com/unicode-org/message-format-wg/pull/884
---
 icu4c/source/i18n/messageformat2_parser.cpp   | 113 +++++++++----
 icu4c/source/i18n/messageformat2_parser.h     |   5 +-
 .../intltest/messageformat2test_read_json.cpp |   3 +
 .../test/intltest/messageformat2test_utils.h  |   3 +-
 testdata/message2/bidi.json                   | 160 ++++++++++++++++++
 5 files changed, 246 insertions(+), 38 deletions(-)
 create mode 100644 testdata/message2/bidi.json

diff --git a/icu4c/source/i18n/messageformat2_parser.cpp b/icu4c/source/i18n/messageformat2_parser.cpp
index b4768756c5ea..513430967ad4 100644
--- a/icu4c/source/i18n/messageformat2_parser.cpp
+++ b/icu4c/source/i18n/messageformat2_parser.cpp
@@ -125,7 +125,13 @@ static bool isContentChar(UChar32 c) {
            || inRange(c, 0xE000, 0x10FFFF);
 }
 
-// See `s` in the MessageFormat 2 grammar
+// See `bidi` in the MF2 grammar
+static bool isBidi(UChar32 c) {
+    return (c == 0x061C || c == 0x200E || c == 0x200F ||
+            inRange(c, 0x2066, 0x2069));
+}
+
+// See `ws` in the MessageFormat 2 grammar
 inline bool isWhitespace(UChar32 c) {
     switch (c) {
     case SPACE:
@@ -153,8 +159,8 @@ static bool isDigit(UChar32 c) { return inRange(c, 0x0030, 0x0039); }
 
 static bool isNameStart(UChar32 c) {
     return isAlpha(c) || c == UNDERSCORE || inRange(c, 0x00C0, 0x00D6) || inRange(c, 0x00D8, 0x00F6) ||
-           inRange(c, 0x00F8, 0x02FF) || inRange(c, 0x0370, 0x037D) || inRange(c, 0x037F, 0x1FFF) ||
-           inRange(c, 0x200C, 0x200D) || inRange(c, 0x2070, 0x218F) || inRange(c, 0x2C00, 0x2FEF) ||
+           inRange(c, 0x00F8, 0x02FF) || inRange(c, 0x0370, 0x037D) || inRange(c, 0x037F, 0x061B) ||
+           inRange(c, 0x061D, 0x200D) || inRange(c, 0x2070, 0x218F) || inRange(c, 0x2C00, 0x2FEF) ||
            inRange(c, 0x3001, 0xD7FF) || inRange(c, 0xF900, 0xFDCF) || inRange(c, 0xFDF0, 0xFFFD) ||
            inRange(c, 0x10000, 0xEFFFF);
 }
@@ -347,7 +353,7 @@ option, or the optional space before an attribute.
   No pre, no post.
   A message may end with whitespace, so `index` may equal `len()` on exit.
 */
-void Parser::parseWhitespaceMaybeRequired(bool required, UErrorCode& errorCode) {
+void Parser::parseRequiredWS(UErrorCode& errorCode) {
     bool sawWhitespace = false;
 
     // The loop exits either when we consume all the input,
@@ -358,7 +364,7 @@ void Parser::parseWhitespaceMaybeRequired(bool required, UErrorCode& errorCode)
             // If whitespace isn't required -- or if we saw it already --
             // then the caller is responsible for checking this case and
             // setting an error if necessary.
-            if (!required || sawWhitespace) {
+            if (sawWhitespace) {
                 // Not an error.
                 return;
             }
@@ -380,24 +386,51 @@ void Parser::parseWhitespaceMaybeRequired(bool required, UErrorCode& errorCode)
         }
     }
 
-    if (!sawWhitespace && required) {
+    if (!sawWhitespace) {
         ERROR(errorCode);
     }
 }
 
+void Parser::parseOptionalBidi() {
+    while (true) {
+        if (!inBounds()) {
+            return;
+        }
+        if (isBidi(peek())) {
+            next();
+        } else {
+            break;
+        }
+    }
+}
+
 /*
-  No pre, no post, for the same reason as `parseWhitespaceMaybeRequired()`.
+  No pre, no post, because a message may end with whitespace
+  Matches `s` in the MF2 grammar
 */
 void Parser::parseRequiredWhitespace(UErrorCode& errorCode) {
-    parseWhitespaceMaybeRequired(true, errorCode);
+    parseOptionalBidi();
+    parseRequiredWS(errorCode);
+    parseOptionalWhitespace();
     normalizedInput += SPACE;
 }
 
 /*
   No pre, no post, for the same reason as `parseWhitespaceMaybeRequired()`.
 */
-void Parser::parseOptionalWhitespace(UErrorCode& errorCode) {
-    parseWhitespaceMaybeRequired(false, errorCode);
+void Parser::parseOptionalWhitespace() {
+    while (true) {
+        if (!inBounds()) {
+            return;
+        }
+        auto cp = peek();
+        if (isWhitespace(cp) || isBidi(cp)) {
+            maybeAdvanceLine();
+            next();
+        } else {
+            break;
+        }
+    }
 }
 
 // Consumes a single character, signaling an error if `peek()` != `c`
@@ -442,11 +475,11 @@ void Parser::parseToken(const std::u16string_view& token, UErrorCode& errorCode)
 */
 void Parser::parseTokenWithWhitespace(const std::u16string_view& token, UErrorCode& errorCode) {
     // No need for error check or bounds check before parseOptionalWhitespace
-    parseOptionalWhitespace(errorCode);
+    parseOptionalWhitespace();
     // Establish precondition
     CHECK_BOUNDS(errorCode);
     parseToken(token, errorCode);
-    parseOptionalWhitespace(errorCode);
+    parseOptionalWhitespace();
     // Guarantee postcondition
     CHECK_BOUNDS(errorCode);
 }
@@ -458,12 +491,12 @@ void Parser::parseTokenWithWhitespace(const std::u16string_view& token, UErrorCo
    then consumes optional whitespace again
 */
 void Parser::parseTokenWithWhitespace(UChar32 c, UErrorCode& errorCode) {
-    // No need for error check or bounds check before parseOptionalWhitespace(errorCode)
-    parseOptionalWhitespace(errorCode);
+    // No need for error check or bounds check before parseOptionalWhitespace()
+    parseOptionalWhitespace();
     // Establish precondition
     CHECK_BOUNDS(errorCode);
     parseToken(c, errorCode);
-    parseOptionalWhitespace(errorCode);
+    parseOptionalWhitespace();
     // Guarantee postcondition
     CHECK_BOUNDS(errorCode);
 }
@@ -482,11 +515,17 @@ UnicodeString Parser::parseName(UErrorCode& errorCode) {
 
     U_ASSERT(inBounds());
 
-    if (!isNameStart(peek())) {
+    if (!(isNameStart(peek()) || isBidi(peek()))) {
         ERROR(errorCode);
         return name;
     }
 
+    // name       = [bidi] name-start *name-char [bidi]
+
+    // [bidi]
+    parseOptionalBidi();
+
+    // name-start *name-char
     while (isNameChar(peek())) {
         UChar32 c = peek();
         name += c;
@@ -497,6 +536,10 @@ UnicodeString Parser::parseName(UErrorCode& errorCode) {
             break;
         }
     }
+
+    // [bidi]
+    parseOptionalBidi();
+
     return name;
 }
 
@@ -853,7 +896,7 @@ void Parser::parseAttribute(AttributeAdder<T>& attrAdder, UErrorCode& errorCode)
     // about whether whitespace precedes another
     // attribute, or the '=' sign
     int32_t savedIndex = index;
-    parseOptionalWhitespace(errorCode);
+    parseOptionalWhitespace();
 
     Operand rand;
     if (peek() == EQUALS) {
@@ -1149,7 +1192,7 @@ the comment in `parseOptions()` for details.
       // (the character is either the required space before an annotation, or optional
       // trailing space after the literal or variable). It's still ambiguous which
       // one does apply.
-      parseOptionalWhitespace(status);
+      parseOptionalWhitespace();
       // Restore precondition
       CHECK_BOUNDS(status);
 
@@ -1220,7 +1263,7 @@ Expression Parser::parseExpression(UErrorCode& status) {
     // Parse opening brace
     parseToken(LEFT_CURLY_BRACE, status);
     // Optional whitespace after opening brace
-    parseOptionalWhitespace(status);
+    parseOptionalWhitespace();
 
     Expression::Builder exprBuilder(status);
     // Restore precondition
@@ -1263,7 +1306,7 @@ Expression Parser::parseExpression(UErrorCode& status) {
 
     // Parse optional space
     // (the last [s] in e.g. "{" [s] literal [s annotation] *(s attribute) [s] "}")
-    parseOptionalWhitespace(status);
+    parseOptionalWhitespace();
 
     // Either an operand or operator (or both) must have been set already,
     // so there can't be an error
@@ -1339,7 +1382,7 @@ void Parser::parseInputDeclaration(UErrorCode& status) {
     CHECK_BOUNDS(status);
 
     parseToken(ID_INPUT, status);
-    parseOptionalWhitespace(status);
+    parseOptionalWhitespace();
 
     // Restore precondition before calling parseExpression()
     CHECK_BOUNDS(status);
@@ -1400,7 +1443,7 @@ void Parser::parseDeclarations(UErrorCode& status) {
         // Avoid looping infinitely
         CHECK_ERROR(status);
 
-        parseOptionalWhitespace(status);
+        parseOptionalWhitespace();
         // Restore precondition
         CHECK_BOUNDS(status);
     }
@@ -1510,8 +1553,8 @@ This is addressed using "backtracking" (similarly to `parseOptions()`).
 
     // We've seen at least one whitespace-key pair, so now we can parse
     // *(s key) [s]
-    while (peek() != LEFT_CURLY_BRACE || isWhitespace(peek())) { // Try to recover from errors
-        bool wasWhitespace = isWhitespace(peek());
+    while (peek() != LEFT_CURLY_BRACE || isWhitespace(peek()) || isBidi(peek())) {
+        bool wasWhitespace = isWhitespace(peek()) || isBidi(peek());
         parseRequiredWhitespace(status);
         if (!wasWhitespace) {
             // Avoid infinite loop when parsing something like:
@@ -1569,7 +1612,7 @@ Markup Parser::parseMarkup(UErrorCode& status) {
     // Consume the '{'
     next();
     normalizedInput += LEFT_CURLY_BRACE;
-    parseOptionalWhitespace(status);
+    parseOptionalWhitespace();
     bool closing = false;
     switch (peek()) {
     case NUMBER_SIGN: {
@@ -1596,19 +1639,19 @@ Markup Parser::parseMarkup(UErrorCode& status) {
 
     // Parse the options, which must begin with a ' '
     // if present
-    if (inBounds() && isWhitespace(peek())) {
+    if (inBounds() && (isWhitespace(peek()) || isBidi(peek()))) {
         OptionAdder<Markup::Builder> optionAdder(builder);
         parseOptions(optionAdder, status);
     }
 
     // Parse the attributes, which also must begin
     // with a ' '
-    if (inBounds() && isWhitespace(peek())) {
+    if (inBounds() && (isWhitespace(peek()) || isBidi(peek()))) {
         AttributeAdder<Markup::Builder> attrAdder(builder);
         parseAttributes(attrAdder, status);
     }
 
-    parseOptionalWhitespace(status);
+    parseOptionalWhitespace();
 
     bool standalone = false;
     // Check if this is a standalone or not
@@ -1656,7 +1699,7 @@ std::variant<Expression, Markup> Parser::parsePlaceholder(UErrorCode& status) {
             isMarkup = true;
             break;
         }
-        if (!isWhitespace(c)){
+        if (!(isWhitespace(c) || isBidi(c))) {
             break;
         }
         tempIndex++;
@@ -1740,7 +1783,7 @@ void Parser::parseSelectors(UErrorCode& status) {
     // "Backtracking" is required here. It's not clear if whitespace is
     // (`[s]` selector) or (`[s]` variant)
     while (isWhitespace(peek()) || peek() == LEFT_CURLY_BRACE) {
-        parseOptionalWhitespace(status);
+        parseOptionalWhitespace();
         // Restore precondition
         CHECK_BOUNDS(status);
         if (peek() != LEFT_CURLY_BRACE) {
@@ -1770,9 +1813,9 @@ void Parser::parseSelectors(UErrorCode& status) {
         }                                          \
 
     // Parse variants
-    while (isWhitespace(peek()) || isKeyStart(peek())) {
+    while (isWhitespace(peek()) || isBidi(peek()) || isKeyStart(peek())) {
         // Trailing whitespace is allowed
-        parseOptionalWhitespace(status);
+        parseOptionalWhitespace();
         if (!inBounds()) {
             return;
         }
@@ -1871,7 +1914,7 @@ void Parser::parse(UParseError &parseErrorResult, UErrorCode& status) {
     bool complex = false;
     // First, "look ahead" to determine if this is a simple or complex
     // message. To do that, check the first non-whitespace character.
-    while (inBounds(index) && isWhitespace(peek())) {
+    while (inBounds(index) && (isWhitespace(peek()) || isBidi(peek()))) {
         next();
     }
 
@@ -1891,10 +1934,10 @@ void Parser::parse(UParseError &parseErrorResult, UErrorCode& status) {
     // Message can be empty, so we need to only look ahead
     // if we know it's non-empty
     if (complex) {
-        parseOptionalWhitespace(status);
+        parseOptionalWhitespace();
         parseDeclarations(status);
         parseBody(status);
-        parseOptionalWhitespace(status);
+        parseOptionalWhitespace();
     } else {
         // Simple message
         // For normalization, quote the pattern
diff --git a/icu4c/source/i18n/messageformat2_parser.h b/icu4c/source/i18n/messageformat2_parser.h
index b62cbe9200b9..a461ecca6545 100644
--- a/icu4c/source/i18n/messageformat2_parser.h
+++ b/icu4c/source/i18n/messageformat2_parser.h
@@ -102,9 +102,10 @@ namespace message2 {
         void parseInputDeclaration(UErrorCode&);
 	void parseSelectors(UErrorCode&);
 
-	void parseWhitespaceMaybeRequired(bool, UErrorCode&);
+	void parseRequiredWS(UErrorCode&);
 	void parseRequiredWhitespace(UErrorCode&);
-	void parseOptionalWhitespace(UErrorCode&);
+	void parseOptionalBidi();
+	void parseOptionalWhitespace();
 	void parseToken(UChar32, UErrorCode&);
 	void parseTokenWithWhitespace(UChar32, UErrorCode&);
 	void parseToken(const std::u16string_view&, UErrorCode&);
diff --git a/icu4c/source/test/intltest/messageformat2test_read_json.cpp b/icu4c/source/test/intltest/messageformat2test_read_json.cpp
index ddf93da632ce..96906f618f8a 100644
--- a/icu4c/source/test/intltest/messageformat2test_read_json.cpp
+++ b/icu4c/source/test/intltest/messageformat2test_read_json.cpp
@@ -309,6 +309,9 @@ void TestMessageFormat2::jsonTestsFromFiles(IcuTestErrorCode& errorCode) {
     runTestsFromJsonFile(*this, "spec/functions/time.json", errorCode);
 
     // Other tests (non-spec)
+    // TODO: https://github.com/unicode-org/message-format-wg/pull/902 will
+    // move the bidi tests into the spec
+    runTestsFromJsonFile(*this, "bidi.json", errorCode);
     runTestsFromJsonFile(*this, "more-functions.json", errorCode);
     runTestsFromJsonFile(*this, "valid-tests.json", errorCode);
     runTestsFromJsonFile(*this, "resolution-errors.json", errorCode);
diff --git a/icu4c/source/test/intltest/messageformat2test_utils.h b/icu4c/source/test/intltest/messageformat2test_utils.h
index c4ad251c7f48..563c8b70cc7a 100644
--- a/icu4c/source/test/intltest/messageformat2test_utils.h
+++ b/icu4c/source/test/intltest/messageformat2test_utils.h
@@ -252,7 +252,8 @@ class TestUtils {
         if (!roundTrip(in, mf.getDataModel(), out)
             // For now, don't round-trip messages with these errors,
             // since duplicate options are dropped
-            && testCase.expectedErrorCode() != U_MF_DUPLICATE_OPTION_NAME_ERROR) {
+            && (testCase.expectSuccess() ||
+                testCase.expectedErrorCode() != U_MF_DUPLICATE_OPTION_NAME_ERROR)) {
             failRoundTrip(tmsg, testCase, in, out);
         }
 
diff --git a/testdata/message2/bidi.json b/testdata/message2/bidi.json
new file mode 100644
index 000000000000..0786124b3a54
--- /dev/null
+++ b/testdata/message2/bidi.json
@@ -0,0 +1,160 @@
+{
+  "scenario": "Bidi support",
+  "description": "Tests for correct parsing of messages with bidirectional marks and isolates",
+  "defaultTestProperties": {
+    "locale": "en-US"
+  },
+  "tests": [
+    {
+        "comment": "simple-message    = o [simple-start pattern]",
+        "src": "  \u061C Hello world!",
+        "exp": "  \u061C Hello world!"
+    },
+    {
+        "comment": "complex-message   = o *(declaration o) complex-body o",
+        "src": "\u200E .local $x = {1} {{ {$x}}}",
+        "exp": " 1"
+    },
+    {
+        "comment": "complex-message   = o *(declaration o) complex-body o",
+        "src": ".local $x = {1} \u200F {{ {$x}}}",
+        "exp": " 1"
+    },
+    {
+        "comment": "complex-message   = o *(declaration o) complex-body o",
+        "src": ".local $x = {1} {{ {$x}}} \u2066",
+        "exp": "1"
+    },
+    {
+        "comment": "input-declaration = input o variable-expression",
+        "src": ".input \u2067 {$x :number} {{hello}}",
+        "params": [{"name": "x", "value": "1"}],
+        "exp": "hello"
+    },
+    {
+        "comment": "local s variable o \"=\" o expression",
+        "src": ".local $x \u2068 = \u2069 {1} {{hello}}",
+        "exp": "hello"
+    },
+    {
+        "comment": "local s variable o \"=\" o expression",
+        "src": ".local \u2067 $x = {1} {{hello}}",
+        "exp": "hello"
+    },
+    {
+        "comment": "local s variable o \"=\" o expression",
+        "src": ".local\u2067 $x = {1} {{hello}}",
+        "exp": "hello"
+    },
+    {
+        "comment": "o \"{{\" pattern \"}}\"",
+        "src": "\u2067 {{hello}}",
+        "exp": "hello"
+    },
+    {
+        "comment": "match-statement s variant *(o variant)",
+        "src": [".local $x = {1 :number}\n",
+                ".match {$x}\n",
+                "1 {{one}}\n",
+                "\u061C * {{other}}"],
+        "exp": "one"
+    },
+    {
+        "comment": "match-statement s variant *(o variant)",
+        "src": [".local $x = {1 :number}",
+                ".match {$x} \u061c",
+                "1 {{one}}",
+                "* {{other}}"],
+        "exp": "one"
+    },
+    {
+        "comment": "match-statement s variant *(o variant)",
+        "src": [".local $x = {1 :number}",
+                ".match {$x}\u061c",
+                "1 {{one}}",
+                "* {{other}}"],
+        "exp": "one"
+    },
+    {
+        "comment": "variant = key *(s key) quoted-pattern",
+        "src": [".local $x = {1 :number} .local $y = {$x :number}",
+                ".match {$x} {$y}",
+                "1 \u200E 1 {{one}}",
+                "* * {{other}}"],
+        "exp": "one"
+    },
+    {
+        "comment": "variant = key *(s key) quoted-pattern",
+        "src": [".local $x = {1 :number} .local $y = {$x :number}",
+                ".match {$x} {$y}",
+                "1\u200E 1 {{one}}",
+                "* * {{other}}"],
+        "exp": "one"
+    },
+    {
+        "comment": "literal-expression  = \"{\" o literal [s function] *(s attribute) o \"}\"",
+        "src": "{\u200E hello \u200F}",
+        "exp": "hello"
+    },
+    {
+        "comment": "variable-expression = \"{\" o variable [s function] *(s attribute) o \"}\"",
+        "src": ".local $x = {1} {{ {\u200E $x \u200F} }}",
+        "exp": "1"
+    },
+    {
+        "comment": "function-expression = \"{\" o function *(s attribute) o \"}\"",
+        "src": "{1 \u200E :number \u200F}",
+        "exp": "1"
+    },
+    {
+        "comment": "markup = \"{\" o \"#\" identifier *(s option) *(s attribute) o [\"/\"] \"}\"",
+        "src": "{\u200F #b \u200E }",
+        "exp": ""
+    },
+    {
+        "comment": "markup = \"{\" o \"/\" identifier *(s option) *(s attribute) o \"}\"",
+        "src": "{\u200F /b \u200E }",
+        "exp": ""
+    },
+    {
+        "comment": "option = identifier o \"=\" o (literal / variable)",
+        "src": "{1 :number minimumFractionDigits\u200F=\u200E1 }",
+        "exp": "1.1"
+    },
+    {
+        "comment": "attribute      = \"@\" identifier [o \"=\" o (literal / variable)]",
+        "src": "{1 :number @locale\u200F=\u200Een }",
+        "exp": "1"
+    },
+    {
+        "comment":  " name... excludes U+FFFD and U+061C -- this pases as name -> [bidi] name-start *name-char",
+        "src": ".local $\u061Cfoo = {1} {{ {$\u061Cfoo} }}",
+        "exp": " 1 "
+    },
+    {
+        "comment":  " name matches https://www.w3.org/TR/REC-xml-names/#NT-NCName but excludes U+FFFD and U+061C",
+        "src": ".local $foo\u061Cbar = {2} {{ }}",
+        "expErrors": [{"type": "syntax-error"}]
+    },
+    {
+        "comment":  "name       = [bidi] name-start *name-char [bidi]",
+        "src": ".local $\u200Efoo\u200F = {3} {{$\u200Efoo\u200F}}",
+        "exp": "3"
+    },
+    {
+        "comment":  "name       = [bidi] name-start *name-char [bidi]",
+        "src": ".local $foo = {4} {{$\u200Efoo\u200F}}",
+        "exp": "4"
+    },
+    {
+        "comment":  "name       = [bidi] name-start *name-char [bidi]",
+        "src": ".local $\u200Efoo\u200F = {4} {{$foo}}",
+        "exp": "5"
+    },
+    {
+        "comment":  "name       = [bidi] name-start *name-char [bidi]",
+        "src": ".local $foo\u200Ebar = {5} {{$foo\u200Ebar}}",
+        "expErrors": [{"type": "syntax-error"}]
+    }
+  ]
+}