commit ec63f8d2fa89df3df5221647f60333801472ff11 Author: Brendan Jones Date: Tue Nov 20 17:26:15 2018 +0100 Correct json utf-8 literal parsing - see https://github.com/Microsoft/cpprestsdk/issues/139 diff --git a/Release/src/json/json_parsing.cpp b/Release/src/json/json_parsing.cpp index 13ecb543..ef1cb52d 100644 --- a/Release/src/json/json_parsing.cpp +++ b/Release/src/json/json_parsing.cpp @@ -738,36 +738,82 @@ inline bool JSON_Parser::handle_unescape_char(Token &token) return true; case 'u': { - // A four-hexdigit Unicode character. - // Transform into a 16 bit code point. - int decoded = 0; - for (int i = 0; i < 4; ++i) - { - ch = NextCharacter(); - int ch_int = static_cast(ch); - if (ch_int < 0 || ch_int > 127) - return false; + // NI patch taken from https://github.com/Microsoft/cpprestsdk/commit/c19e39249667771514b482adcbca1af8ff7ad311 + // see Release//src/utilities/asyncrt_utils.cpp for the defines + auto decode_utf16_unit = [](JSON_Parser& parser, json_error& ec) { + // A four-hexdigit Unicode character. + // Transform into a 16 bit code point. +#define L_SURROGATE_START 0xDC00 +#define L_SURROGATE_END 0xDFFF +#define H_SURROGATE_START 0xD800 +#define H_SURROGATE_END 0xDBFF + int decoded = 0; + for (int i = 0; i < 4; ++i) { + int ch_int = parser.NextCharacter(); + if (ch_int < 0 || ch_int > 127) { + ec = json_error::malformed_string_literal; + return 0; + } #ifdef _WIN32 - const int isxdigitResult = _isxdigit_l(ch_int, utility::details::scoped_c_thread_locale::c_locale()); + const int isxdigitResult = + _isxdigit_l(ch_int, utility::details::scoped_c_thread_locale::c_locale()); #else - const int isxdigitResult = isxdigit(ch_int); + const int isxdigitResult = isxdigit(ch_int); #endif - if (!isxdigitResult) - return false; + if (!isxdigitResult) { + ec = json_error::malformed_string_literal; + return 0; + } - int val = _hexval[static_cast(ch_int)]; - _ASSERTE(val != -1); + int val = _hexval[static_cast(ch_int)]; + _ASSERTE(val != -1); - // Add the input char to the decoded number - decoded |= (val << (4 * (3 - i))); - } + // Add the input char to the decoded number + decoded |= (val << (4 * (3 - i))); + } + + return decoded; + }; // Construct the character based on the decoded number - convert_append_unicode_code_unit(token, static_cast(decoded)); + // Convert the code unit into a UTF-8 sequence + utf16string utf16; + json_error ec{}; + auto decoded = decode_utf16_unit(*this, ec); + if (ec) + return false; + utf16.push_back(static_cast(decoded)); - return true; + if ( (decoded >= H_SURROGATE_START) && (decoded <= H_SURROGATE_END ) ) { + // Decoded a high surrogate. Attempt to grab low surrogate. + if (NextCharacter() != '\\') { + SetErrorCode(token, json_error::malformed_string_literal); + return false; + } + if (NextCharacter() != 'u') { + SetErrorCode(token, json_error::malformed_string_literal); + return false; + } + decoded = decode_utf16_unit(*this, ec); + if (ec) + return false; + utf16.push_back(static_cast(decoded)); + } + + try { + utf8string utf8; + utf8 = ::utility::conversions::utf16_to_utf8(utf16); + token.string_val.append(utf8); + return true; + } + catch (...) { + SetErrorCode(token, json_error::malformed_string_literal); + return false; + } } default: + // BUG: This is incorrect behavior; all characters MAY be escaped, and + // should be added as-is. return false; } }