Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handle multi-byte unicode characters in json parsing #1023

Merged
merged 2 commits into from
Jan 28, 2019
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
89 changes: 69 additions & 20 deletions Release/src/json/json_parsing.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,7 @@ class JSON_Parser

virtual bool CompleteComment(Token& token);
virtual bool CompleteStringLiteral(Token& token);
int convert_unicode_to_code_point();
bool handle_unescape_char(Token& token);

private:
Expand Down Expand Up @@ -652,7 +653,15 @@ bool JSON_StringParser<CharType>::CompleteComment(typename JSON_Parser<CharType>
return true;
}

void convert_append_unicode_code_unit(JSON_Parser<wchar_t>::Token& token, utf16char value)
// Overload for a UTF-16 parser token taking a whole UTF-16 sequence:
// the code units are appended to the token's string as-is, with no transcoding.
void convert_append_unicode_code_unit(JSON_Parser<utf16char>::Token& token, utf16string value)
{
    auto& out = token.string_val;
    out.append(value);
}
// Overload for a narrow-char parser token taking a whole UTF-16 sequence:
// the sequence is first transcoded with utf16_to_utf8, and the result is
// then appended to the token's string.
void convert_append_unicode_code_unit(JSON_Parser<char>::Token& token, utf16string value)
{
    const auto utf8 = ::utility::conversions::utf16_to_utf8(value);
    token.string_val.append(utf8);
}
// Overload for a UTF-16 parser token taking a single UTF-16 code unit:
// the unit is pushed onto the end of the token's string unchanged.
void convert_append_unicode_code_unit(JSON_Parser<utf16char>::Token& token, utf16char value)
{
    auto& out = token.string_val;
    out.push_back(value);
}
Expand All @@ -662,6 +671,37 @@ void convert_append_unicode_code_unit(JSON_Parser<char>::Token& token, utf16char
token.string_val.append(::utility::conversions::utf16_to_utf8(utf16));
}

// Consumes the next four characters from the input and interprets them as
// hexadecimal digits, most significant digit first, building a 16-bit value.
//
// Returns the assembled value (0..0xFFFF), or -1 as soon as a character is
// outside 7-bit ASCII or is not a hexadecimal digit. Characters consumed
// before the failure are not put back.
template<typename CharType>
int JSON_Parser<CharType>::convert_unicode_to_code_point()
{
    int code_unit = 0;

    // One nibble per iteration: shifts of 12, 8, 4, 0 place the digits
    // from the highest nibble down to the lowest.
    for (int shift = 12; shift >= 0; shift -= 4)
    {
        const auto next = NextCharacter();
        const int next_int = static_cast<int>(next);

        // Reject anything outside 0..127 up front; this also bounds the
        // value used to index _hexval below.
        const bool is_ascii = (next_int >= 0 && next_int <= 127);
        if (!is_ascii)
        {
            return -1;
        }

#ifdef _WIN32
        // Classify against the C locale explicitly rather than the current one.
        const bool is_hex = _isxdigit_l(next_int, utility::details::scoped_c_thread_locale::c_locale()) != 0;
#else
        const bool is_hex = isxdigit(next_int) != 0;
#endif
        if (!is_hex)
        {
            return -1;
        }

        const int nibble = _hexval[static_cast<size_t>(next_int)];
        _ASSERTE(nibble != -1);

        // Merge this digit into its position in the result.
        code_unit |= (nibble << shift);
    }

    return code_unit;
}

// First and last code units of the UTF-16 high (leading) surrogate range,
// U+D800..U+DBFF. A code unit in this range is the first half of a surrogate
// pair and must be followed by a second \uXXXX escape.
#define H_SURROGATE_START 0xD800
#define H_SURROGATE_END 0xDBFF

template<typename CharType>
inline bool JSON_Parser<CharType>::handle_unescape_char(Token& token)
{
Expand All @@ -682,26 +722,31 @@ inline bool JSON_Parser<CharType>::handle_unescape_char(Token& token)
case 't': token.string_val.push_back('\t'); return true;
case 'u':
{
// A four-hexdigit Unicode character.
// Transform into a 16 bit code point.
int decoded = 0;
for (int i = 0; i < 4; ++i)
int decoded = convert_unicode_to_code_point();
if (decoded == -1)
{
ch = NextCharacter();
int ch_int = static_cast<int>(ch);
if (ch_int < 0 || ch_int > 127) return false;
#ifdef _WIN32
const int isxdigitResult = _isxdigit_l(ch_int, utility::details::scoped_c_thread_locale::c_locale());
#else
const int isxdigitResult = isxdigit(ch_int);
#endif
if (!isxdigitResult) return false;
return false;
}

// handle multi-block characters that start with a high-surrogate
if (decoded > H_SURROGATE_START && decoded < H_SURROGATE_END)
Tymolc marked this conversation as resolved.
Show resolved Hide resolved
{
// skip escape character '\u'
if (NextCharacter() != '\\' || NextCharacter() != 'u')
{
return false;
}
int decoded2 = convert_unicode_to_code_point();

if (decoded2 == -1)
{
return false;
}

int val = _hexval[static_cast<size_t>(ch_int)];
_ASSERTE(val != -1);
utf16string compoundUTF16 = {static_cast<utf16char>(decoded), static_cast<utf16char>(decoded2)};
convert_append_unicode_code_unit(token, compoundUTF16);

// Add the input char to the decoded number
decoded |= (val << (4 * (3 - i)));
return true;
}

// Construct the character based on the decoded number
Expand Down Expand Up @@ -1015,9 +1060,13 @@ std::unique_ptr<web::json::details::_Value> JSON_Parser<CharType>::_ParseValue(
{
switch (tkn.kind)
{
case JSON_Parser<CharType>::Token::TKN_OpenBrace: { return _ParseObject(tkn);
case JSON_Parser<CharType>::Token::TKN_OpenBrace:
{
return _ParseObject(tkn);
}
case JSON_Parser<CharType>::Token::TKN_OpenBracket: { return _ParseArray(tkn);
case JSON_Parser<CharType>::Token::TKN_OpenBracket:
{
return _ParseArray(tkn);
}
case JSON_Parser<CharType>::Token::TKN_StringLiteral:
{
Expand Down
8 changes: 7 additions & 1 deletion Release/tests/functional/json/parsing_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ SUITE(parsing_tests)
input.append(2, ch);
json::value val = json::value::parse(input);
VERIFY_IS_TRUE(val.is_object());
VERIFY_ARE_EQUAL(U("2"), val[U("1"]).serialize());
VERIFY_ARE_EQUAL(U("2"), val[U("1")].serialize());
}
}

Expand Down Expand Up @@ -213,6 +213,12 @@ SUITE(parsing_tests)
const auto euro = to_string_t("\xE2\x82\xAC");
VERIFY_ARE_EQUAL(euro, str.as_string());

// UTF-16 character with surrogate pair
str = json::value::parse(U("\"\\ud83d\\ude00\""));
// Grinning Face emoji as a hexadecimal UTF-8
const auto emoji = to_string_t("\xF0\x9F\x98\x80");
VERIFY_ARE_EQUAL(emoji, str.as_string());

VERIFY_PARSING_THROW(json::value::parse(U("\"\\u0klB\"")));
}

Expand Down