Skip to content

Commit

Permalink
Merge pull request microsoft#10 in CR/cpprestsdk from fix/emoji_parsing_in_json to master
Browse files Browse the repository at this point in the history

* commit '6c927f0c65700b535f932037a64a74910cd10e0e':
  fix json parsing of surrogate-pair utf16 characters
  add multi-block utf16 symbol parsing test
  • Loading branch information
Tymolc committed Jun 25, 2018
2 parents 9f61d38 + 6c927f0 commit fbdba53
Show file tree
Hide file tree
Showing 2 changed files with 58 additions and 25 deletions.
71 changes: 49 additions & 22 deletions Release/src/json/json_parsing.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,7 @@ class JSON_Parser

virtual bool CompleteComment(Token &token);
virtual bool CompleteStringLiteral(Token &token);
int convert_unicode_to_code_point(Token &token);
bool handle_unescape_char(Token &token);

private:
Expand Down Expand Up @@ -702,6 +703,36 @@ void convert_append_unicode_code_unit(JSON_Parser<char>::Token &token, utf16char
token.string_val.append(::utility::conversions::utf16_to_utf8(utf16));
}

template <typename CharType>
int JSON_Parser<CharType>::convert_unicode_to_code_point(Token &token)
{
    // Reads four characters via NextCharacter() and interprets them as the
    // hexadecimal digits of a \uXXXX escape, most significant digit first.
    //
    // Returns the assembled 16-bit value, or -1 as soon as one of the four
    // characters is outside the ASCII range or is not a hexadecimal digit.
    // `token` is neither read nor modified here.
    int code_unit = 0;
    for (int shift = 12; shift >= 0; shift -= 4)
    {
        const int c = static_cast<int>(NextCharacter());

        // Only 7-bit ASCII is acceptable; this also keeps the table lookup
        // below within the 0..127 index range.
        if (c < 0 || c > 127)
        {
            return -1;
        }

#ifdef _WIN32
        const bool is_hex_digit =
            _isxdigit_l(c, utility::details::scoped_c_thread_locale::c_locale()) != 0;
#else
        const bool is_hex_digit = isxdigit(c) != 0;
#endif
        if (!is_hex_digit)
        {
            return -1;
        }

        const int nibble = _hexval[static_cast<size_t>(c)];
        _ASSERTE(nibble != -1);

        // Place this digit's four bits at its position in the result.
        code_unit |= (nibble << shift);
    }

    return code_unit;
}

template <typename CharType>
inline bool JSON_Parser<CharType>::handle_unescape_char(Token &token)
{
Expand Down Expand Up @@ -738,32 +769,28 @@ inline bool JSON_Parser<CharType>::handle_unescape_char(Token &token)
return true;
case 'u':
{
// A four-hexdigit Unicode character.
// Transform into a 16 bit code point.
int decoded = 0;
for (int i = 0; i < 4; ++i)
int decoded = convert_unicode_to_code_point(token);
if (decoded == -1)
{
ch = NextCharacter();
int ch_int = static_cast<int>(ch);
if (ch_int < 0 || ch_int > 127)
return false;
#ifdef _WIN32
const int isxdigitResult = _isxdigit_l(ch_int, utility::details::scoped_c_thread_locale::c_locale());
#else
const int isxdigitResult = isxdigit(ch_int);
#endif
if (!isxdigitResult)
return false;
return false;
}

int val = _hexval[static_cast<size_t>(ch_int)];
_ASSERTE(val != -1);
// handle multi-block characters that start with a high-surrogate
if (decoded > 55296 && decoded < 56319)
{
// skip escape character
NextCharacter(); NextCharacter();
int decoded2 = convert_unicode_to_code_point(token);

// Add the input char to the decoded number
decoded |= (val << (4 * (3 - i)));
utf16string compoundUTF16 = { static_cast<utf16char>(decoded),
static_cast<utf16char>(decoded2) };
token.string_val.append(::utility::conversions::utf16_to_utf8(compoundUTF16));
}
else
{
// Construct the character based on the decoded number
convert_append_unicode_code_unit(token, static_cast<utf16char>(decoded));
}

// Construct the character based on the decoded number
convert_append_unicode_code_unit(token, static_cast<utf16char>(decoded));

return true;
}
Expand Down
12 changes: 9 additions & 3 deletions Release/tests/functional/json/parsing_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ TEST(whitespace_object)
input.append(2, ch);
json::value val = json::value::parse(input);
VERIFY_IS_TRUE(val.is_object());
VERIFY_ARE_EQUAL(U("2"), val[U("1"]).serialize());
VERIFY_ARE_EQUAL(U("2"), val[U("1")].serialize());
}
}

Expand Down Expand Up @@ -208,6 +208,12 @@ TEST(escaped_unicode_string)
const auto euro = to_string_t("\xE2\x82\xAC");
VERIFY_ARE_EQUAL(euro, str.as_string());

// UTF-16 character with surrogate pair
str = json::value::parse(U("\"\\ud83d\\ude00\""));
// Grinning Face emoji as a hexadecimal UTF-8
const auto emoji = to_string_t("\xF0\x9F\x98\x80");
VERIFY_ARE_EQUAL(emoji, str.as_string());

VERIFY_PARSING_THROW(json::value::parse(U("\"\\u0klB\"")));
}

Expand Down Expand Up @@ -651,7 +657,7 @@ TEST(non_default_locale, "Ignore:Android", "Locale unsupported on Android")

setlocale(LC_ALL, originalLocale.c_str());
setlocale(LC_NUMERIC, changedLocale.c_str());

// cpprestsdk stream serialize
utility::stringstream_t stream;
stream << v;
Expand Down Expand Up @@ -731,7 +737,7 @@ TEST(parse_overload_failed)

utility::stringstream_t stream;
stream << str;

parsedObject = json::value::parse(arrStr, streamErr);
VERIFY_IS_TRUE(streamErr.value() > 0);
VERIFY_IS_TRUE(parsedObject.is_null());
Expand Down

0 comments on commit fbdba53

Please sign in to comment.