Skip to content

Commit

Permalink
Handle multi-byte unicode characters in json parsing (#1023)
Browse files Browse the repository at this point in the history
* handle multi-byte unicode characters in json parsing

* Properly check high surrogate start and end

Co-Authored-By: Tymolc <[email protected]>
  • Loading branch information
Tymolc authored and BillyONeal committed Jan 28, 2019
1 parent 8f0393d commit 5a885dd
Show file tree
Hide file tree
Showing 2 changed files with 76 additions and 21 deletions.
89 changes: 69 additions & 20 deletions Release/src/json/json_parsing.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,7 @@ class JSON_Parser

virtual bool CompleteComment(Token& token);
virtual bool CompleteStringLiteral(Token& token);
int convert_unicode_to_code_point();
bool handle_unescape_char(Token& token);

private:
Expand Down Expand Up @@ -652,7 +653,15 @@ bool JSON_StringParser<CharType>::CompleteComment(typename JSON_Parser<CharType>
return true;
}

void convert_append_unicode_code_unit(JSON_Parser<wchar_t>::Token& token, utf16char value)
// UTF-16 token overload: the supplied UTF-16 sequence is appended to the
// token's string value as-is, with no transcoding step.
void convert_append_unicode_code_unit(JSON_Parser<utf16char>::Token& token, utf16string value)
{
    auto& destination = token.string_val;
    destination += value;
}
// Narrow-char token overload: the UTF-16 sequence is first converted with
// utf16_to_utf8 and the converted text is then appended to the token's string value.
void convert_append_unicode_code_unit(JSON_Parser<char>::Token& token, utf16string value)
{
    const auto utf8_text = ::utility::conversions::utf16_to_utf8(value);
    token.string_val.append(utf8_text);
}
// UTF-16 token overload for a single code unit: the unit is pushed onto the
// end of the token's string value unchanged.
void convert_append_unicode_code_unit(JSON_Parser<utf16char>::Token& token, utf16char value)
{
    auto& destination = token.string_val;
    destination.push_back(value);
}
Expand All @@ -662,6 +671,37 @@ void convert_append_unicode_code_unit(JSON_Parser<char>::Token& token, utf16char
token.string_val.append(::utility::conversions::utf16_to_utf8(utf16));
}

// Consumes the next four characters from the input and interprets them as
// hexadecimal digits, most significant digit first, producing one 16-bit value
// (a single UTF-16 code unit; a surrogate half is returned as-is).
//
// Returns the assembled value, or -1 as soon as a character is outside the
// 0..127 range or is not a hexadecimal digit.
template<typename CharType>
int JSON_Parser<CharType>::convert_unicode_to_code_point()
{
    int code_unit = 0;

    // Each digit contributes 4 bits; the first digit read lands in bits 12..15.
    for (int shift = 12; shift >= 0; shift -= 4)
    {
        const int digit_char = static_cast<int>(NextCharacter());

        // Only values in 0..127 may be classified and used as a table index below.
        if (digit_char < 0 || digit_char > 127)
        {
            return -1;
        }

#ifdef _WIN32
        // Classify against the C locale rather than the current thread locale.
        const int is_hex_digit = _isxdigit_l(digit_char, utility::details::scoped_c_thread_locale::c_locale());
#else
        const int is_hex_digit = isxdigit(digit_char);
#endif
        if (is_hex_digit == 0)
        {
            return -1;
        }

        const int nibble = _hexval[static_cast<size_t>(digit_char)];

        // A character accepted as a hex digit is expected to have a table entry.
        _ASSERTE(nibble != -1);

        code_unit |= (nibble << shift);
    }

    return code_unit;
}

// Inclusive bounds of the UTF-16 high-surrogate range. A "\u" escape that
// decodes to a value in [H_SURROGATE_START, H_SURROGATE_END] is the first half
// of a surrogate pair and must be followed by a second "\u" escape.
// Typed constants are used instead of object-like macros so the names obey
// normal scoping rules and carry a type.
constexpr int H_SURROGATE_START = 0xD800;
constexpr int H_SURROGATE_END = 0xDBFF;

template<typename CharType>
inline bool JSON_Parser<CharType>::handle_unescape_char(Token& token)
{
Expand All @@ -682,26 +722,31 @@ inline bool JSON_Parser<CharType>::handle_unescape_char(Token& token)
case 't': token.string_val.push_back('\t'); return true;
case 'u':
{
// A four-hexdigit Unicode character.
// Transform into a 16 bit code point.
int decoded = 0;
for (int i = 0; i < 4; ++i)
int decoded = convert_unicode_to_code_point();
if (decoded == -1)
{
ch = NextCharacter();
int ch_int = static_cast<int>(ch);
if (ch_int < 0 || ch_int > 127) return false;
#ifdef _WIN32
const int isxdigitResult = _isxdigit_l(ch_int, utility::details::scoped_c_thread_locale::c_locale());
#else
const int isxdigitResult = isxdigit(ch_int);
#endif
if (!isxdigitResult) return false;
return false;
}

// handle multi-block characters that start with a high-surrogate
if (decoded >= H_SURROGATE_START && decoded <= H_SURROGATE_END)
{
// skip escape character '\u'
if (NextCharacter() != '\\' || NextCharacter() != 'u')
{
return false;
}
int decoded2 = convert_unicode_to_code_point();

if (decoded2 == -1)
{
return false;
}

int val = _hexval[static_cast<size_t>(ch_int)];
_ASSERTE(val != -1);
utf16string compoundUTF16 = {static_cast<utf16char>(decoded), static_cast<utf16char>(decoded2)};
convert_append_unicode_code_unit(token, compoundUTF16);

// Add the input char to the decoded number
decoded |= (val << (4 * (3 - i)));
return true;
}

// Construct the character based on the decoded number
Expand Down Expand Up @@ -1015,9 +1060,13 @@ std::unique_ptr<web::json::details::_Value> JSON_Parser<CharType>::_ParseValue(
{
switch (tkn.kind)
{
case JSON_Parser<CharType>::Token::TKN_OpenBrace: { return _ParseObject(tkn);
case JSON_Parser<CharType>::Token::TKN_OpenBrace:
{
return _ParseObject(tkn);
}
case JSON_Parser<CharType>::Token::TKN_OpenBracket: { return _ParseArray(tkn);
case JSON_Parser<CharType>::Token::TKN_OpenBracket:
{
return _ParseArray(tkn);
}
case JSON_Parser<CharType>::Token::TKN_StringLiteral:
{
Expand Down
8 changes: 7 additions & 1 deletion Release/tests/functional/json/parsing_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ SUITE(parsing_tests)
input.append(2, ch);
json::value val = json::value::parse(input);
VERIFY_IS_TRUE(val.is_object());
VERIFY_ARE_EQUAL(U("2"), val[U("1"]).serialize());
VERIFY_ARE_EQUAL(U("2"), val[U("1")].serialize());
}
}

Expand Down Expand Up @@ -213,6 +213,12 @@ SUITE(parsing_tests)
const auto euro = to_string_t("\xE2\x82\xAC");
VERIFY_ARE_EQUAL(euro, str.as_string());

// UTF-16 character with surrogate pair
str = json::value::parse(U("\"\\ud83d\\ude00\""));
// Grinning Face emoji as a hexadecimal UTF-8
const auto emoji = to_string_t("\xF0\x9F\x98\x80");
VERIFY_ARE_EQUAL(emoji, str.as_string());

VERIFY_PARSING_THROW(json::value::parse(U("\"\\u0klB\"")));
}

Expand Down

0 comments on commit 5a885dd

Please sign in to comment.