diff --git a/Release/src/json/json_parsing.cpp b/Release/src/json/json_parsing.cpp index 6de65381f1..2af0ed1629 100644 --- a/Release/src/json/json_parsing.cpp +++ b/Release/src/json/json_parsing.cpp @@ -139,6 +139,7 @@ class JSON_Parser virtual bool CompleteComment(Token& token); virtual bool CompleteStringLiteral(Token& token); + int convert_unicode_to_code_point(); bool handle_unescape_char(Token& token); private: @@ -652,7 +653,15 @@ bool JSON_StringParser::CompleteComment(typename JSON_Parser return true; } -void convert_append_unicode_code_unit(JSON_Parser::Token& token, utf16char value) +void convert_append_unicode_code_unit(JSON_Parser::Token& token, utf16string value) +{ + token.string_val.append(value); +} +void convert_append_unicode_code_unit(JSON_Parser::Token& token, utf16string value) +{ + token.string_val.append(::utility::conversions::utf16_to_utf8(value)); +} +void convert_append_unicode_code_unit(JSON_Parser::Token& token, utf16char value) { token.string_val.push_back(value); } @@ -662,6 +671,37 @@ void convert_append_unicode_code_unit(JSON_Parser::Token& token, utf16char token.string_val.append(::utility::conversions::utf16_to_utf8(utf16)); } +template +int JSON_Parser::convert_unicode_to_code_point() +{ + // A four-hexdigit Unicode character. + // Transform into a 16 bit code point. + int decoded = 0; + for (int i = 0; i < 4; ++i) + { + auto ch = NextCharacter(); + int ch_int = static_cast(ch); + if (ch_int < 0 || ch_int > 127) return -1; +#ifdef _WIN32 + const int isxdigitResult = _isxdigit_l(ch_int, utility::details::scoped_c_thread_locale::c_locale()); +#else + const int isxdigitResult = isxdigit(ch_int); +#endif + if (!isxdigitResult) return -1; + + int val = _hexval[static_cast(ch_int)]; + + _ASSERTE(val != -1); + + // Add the input char to the decoded number + decoded |= (val << (4 * (3 - i))); + } + return decoded; +} + +#define H_SURROGATE_START 0xD800 +#define H_SURROGATE_END 0xDBFF + template inline bool JSON_Parser::handle_unescape_char(Token& token) { @@ -682,26 +722,31 @@ inline bool JSON_Parser::handle_unescape_char(Token& token) case 't': token.string_val.push_back('\t'); return true; case 'u': { - // A four-hexdigit Unicode character. - // Transform into a 16 bit code point. - int decoded = 0; - for (int i = 0; i < 4; ++i) + int decoded = convert_unicode_to_code_point(); + if (decoded == -1) { - ch = NextCharacter(); - int ch_int = static_cast(ch); - if (ch_int < 0 || ch_int > 127) return false; -#ifdef _WIN32 - const int isxdigitResult = _isxdigit_l(ch_int, utility::details::scoped_c_thread_locale::c_locale()); -#else - const int isxdigitResult = isxdigit(ch_int); -#endif - if (!isxdigitResult) return false; + return false; + } + + // handle multi-block characters that start with a high-surrogate + if (decoded >= H_SURROGATE_START && decoded <= H_SURROGATE_END) + { + // skip escape character '\u' + if (NextCharacter() != '\\' || NextCharacter() != 'u') + { + return false; + } + int decoded2 = convert_unicode_to_code_point(); + + if (decoded2 == -1) + { + return false; + } - int val = _hexval[static_cast(ch_int)]; - _ASSERTE(val != -1); + utf16string compoundUTF16 = {static_cast(decoded), static_cast(decoded2)}; + convert_append_unicode_code_unit(token, compoundUTF16); - // Add the input char to the decoded number - decoded |= (val << (4 * (3 - i))); + return true; } // Construct the character based on the decoded number @@ -1015,9 +1060,13 @@ std::unique_ptr JSON_Parser::_ParseValue( { switch (tkn.kind) { - case JSON_Parser::Token::TKN_OpenBrace: { return _ParseObject(tkn); + case JSON_Parser::Token::TKN_OpenBrace: + { + return _ParseObject(tkn); } - case JSON_Parser::Token::TKN_OpenBracket: { return _ParseArray(tkn); + case JSON_Parser::Token::TKN_OpenBracket: + { + return _ParseArray(tkn); } case JSON_Parser::Token::TKN_StringLiteral: { diff --git a/Release/tests/functional/json/parsing_tests.cpp b/Release/tests/functional/json/parsing_tests.cpp index f235893af2..fc7a23779c 100644 --- a/Release/tests/functional/json/parsing_tests.cpp +++ b/Release/tests/functional/json/parsing_tests.cpp @@ -159,7 +159,7 @@ SUITE(parsing_tests) input.append(2, ch); json::value val = json::value::parse(input); VERIFY_IS_TRUE(val.is_object()); - VERIFY_ARE_EQUAL(U("2"), val[U("1"]).serialize()); + VERIFY_ARE_EQUAL(U("2"), val[U("1")].serialize()); } } @@ -213,6 +213,12 @@ SUITE(parsing_tests) const auto euro = to_string_t("\xE2\x82\xAC"); VERIFY_ARE_EQUAL(euro, str.as_string()); + // UTF-16 character with surrogate pair + str = json::value::parse(U("\"\\ud83d\\ude00\"")); + // Grinning Face emoji as a hexadecimal UTF-8 + const auto emoji = to_string_t("\xF0\x9F\x98\x80"); + VERIFY_ARE_EQUAL(emoji, str.as_string()); + VERIFY_PARSING_THROW(json::value::parse(U("\"\\u0klB\""))); }