Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Unicode escaping #22

Merged
merged 7 commits into from
Jan 11, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 35 additions & 0 deletions .idea/codeStyleSettings.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 8 additions & 0 deletions .idea/json.iml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 5 additions & 0 deletions .idea/misc.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 8 additions & 0 deletions .idea/modules.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions .idea/vcs.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@ project(json)
# Enable C++11 and set flags for coverage testing
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -g -O0 --coverage -fprofile-arcs -ftest-coverage")

# Make everything public for testing purposes
add_definitions(-Dprivate=public)

# If not specified, use Debug as build type (necessary for coverage testing)
if( NOT CMAKE_BUILD_TYPE )
set( CMAKE_BUILD_TYPE Debug CACHE STRING
Expand Down
222 changes: 214 additions & 8 deletions src/json.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2052,23 +2052,30 @@ std::string json::parser::parseString()
// the result of the parse process
std::string result;

// iterate with pos_ over the whole string
// iterate with pos_ over the whole input until we found the end and return
// or we exit via error()
for (; pos_ < buffer_.size(); pos_++)
{
char currentChar = buffer_[pos_];

// uneven amount of backslashes means the user wants to escape something
if (!evenAmountOfBackslashes)
{
// uneven amount of backslashes means the user wants to escape
// something so we know there is a case such as '\X' or '\\\X' but
// we don't know yet what X is.
// at this point in the code, the currentChar has the value of X.

// slash, backslash and quote are copied as is
if (currentChar == '/' or currentChar == '\\' or currentChar == '"')
if ( currentChar == '/'
|| currentChar == '\\'
|| currentChar == '"')
{
result += currentChar;
}
else
{
// all other characters are replaced by their respective
// special character
// all other characters are replaced by their respective special
// character
switch (currentChar)
{
case 't':
Expand Down Expand Up @@ -2096,12 +2103,26 @@ std::string json::parser::parseString()
result += '\r';
break;
}
case 'u':
{
// \uXXXX[\uXXXX] is used for escaping unicode, which
// has it's own subroutine.
result += parseUnicodeEscape();
// the parsing process has brought us one step behind
// the unicode escape sequence:
// \uXXXX
// ^
// we need to go one character back or the parser would
// skip the character we are currently pointing at as
// the for-loop will decrement pos_ after this iteration
pos_--;
break;
}
default:
{
error("expected one of \\, /, b, f, n, r, t behind backslash.");
error("expected one of \\, /, b, f, n, r, t, u behind backslash.");
}
}
// TODO implement \uXXXX
}
}
else
Expand All @@ -2120,7 +2141,7 @@ std::string json::parser::parseString()
}
else if (currentChar != '\\')
{
// All non-backslash characters are added to the end of the
// all non-backslash characters are added to the end of the
// result string. The only backslashes we want in the result
// are the ones that are escaped (which happens above).
result += currentChar;
Expand Down Expand Up @@ -2148,6 +2169,191 @@ std::string json::parser::parseString()
error("expected '\"'");
}



/*!
Turns a code point into it's UTF-8 representation.
You should only pass numbers < 0x10ffff into this function
(everything else is a invalid code point).

@return the UTF-8 representation of the given code point

@pre This method isn't accessing the members of the parser

@post This method isn't accessing the members of the parser
*/
std::string json::parser::codePointToUTF8(unsigned int codePoint)
{
// this method contains a lot of bit manipulations to
// build the bytes for UTF-8.

// the '(... >> S) & 0xHH'-patterns are used to retrieve
// certain bits from the code points.

// all static casts in this method have boundary checks

// we initialize all strings with their final length
// (e.g. 1 to 4 bytes) to save the reallocations.


if (codePoint <= 0x7f)
{
// it's just a ASCII compatible codePoint,
// so we just interpret the point as a character
// and return ASCII

return std::string(1, static_cast<char>(codePoint));
}
// if true, we need two bytes to encode this as UTF-8
else if (codePoint <= 0x7ff)
{
// the 0xC0 enables the two most significant two bits
// to make this a two-byte UTF-8 character.
std::string result(2, static_cast<char>(0xC0 | ((codePoint >> 6) & 0x1F)));
result[1] = static_cast<char>(0x80 | (codePoint & 0x3F));
return result;
}
// if true, now we need three bytes to encode this as UTF-8
else if (codePoint <= 0xffff)
{
// the 0xE0 enables the three most significant two bits
// to make this a three-byte UTF-8 character.
std::string result(3, static_cast<char>(0xE0 | ((codePoint >> 12) & 0x0F)));
result[1] = static_cast<char>(0x80 | ((codePoint >> 6) & 0x3F));
result[2] = static_cast<char>(0x80 | (codePoint & 0x3F));
return result;
}
// if true, we need maximal four bytes to encode this as UTF-8
else if (codePoint <= 0x10ffff)
{
// the 0xE0 enables the four most significant two bits
// to make this a three-byte UTF-8 character.
std::string result(4, static_cast<char>(0xF0 | ((codePoint >> 18) & 0x07)));
result[1] = static_cast<char>(0x80 | ((codePoint >> 12) & 0x3F));
result[2] = static_cast<char>(0x80 | ((codePoint >> 6) & 0x3F));
result[3] = static_cast<char>(0x80 | (codePoint & 0x3F));
return result;
}
else
{
// Can't be tested without direct access to this private method.
std::string errorMessage = "Invalid codePoint: ";
errorMessage += codePoint;
error(errorMessage);
}
}

/*!
Parses 4 hexadecimal characters as a number.

@return the value of the number the hexadecimal characters represent.

@pre pos_ is pointing to the first of the 4 hexadecimal characters.

@post pos_ is pointing to the character after the 4 hexadecimal characters.
*/
unsigned int json::parser::parse4HexCodePoint()
{
const auto startPos = pos_;

// check if the remaining buffer is long enough to even hold 4 characters
if (pos_ + 3 >= buffer_.size())
{
error("Got end of input while parsing unicode escape sequence \\uXXXX");
}

// make a string that can hold the pair
std::string hexCode(4, ' ');

for(; pos_ < startPos + 4; pos_++)
{
// no boundary check here as we already checked above
char currentChar = buffer_[pos_];

// check if we have a hexadecimal character
if ( (currentChar >= '0' && currentChar <= '9')
|| (currentChar >= 'a' && currentChar <= 'f')
|| (currentChar >= 'A' && currentChar <= 'F'))
{
// all is well, we have valid hexadecimal chars
// so we copy that char into our string
hexCode[pos_ - startPos] = currentChar;
}
else
{
error("Found non-hexadecimal character in unicode escape sequence!");
}
}
// the cast is safe as 4 hex characters can't present more than 16 bits
// the input to stoul was checked to contain only hexadecimal characters
// (see above)
return static_cast<unsigned int>(std::stoul(hexCode, nullptr, 16));
}

/*!
Parses the unicode escape codes as defined in the ECMA-404.
The escape sequence has two forms:
1. \uXXXX
2. \uXXXX\uYYYY
where X and Y are a hexadecimal character (a-zA-Z0-9).

Form 1 just contains the unicode code point in the hexadecimal number XXXX.
Form 2 is encoding a UTF-16 surrogate pair. The high surrogate is XXXX, the low
surrogate is YYYY.

@return the UTF-8 character this unicode escape sequence escaped.

@pre pos_ is pointing at at the 'u' behind the first backslash.

@post pos_ is pointing at the character behind the last X (or Y in form 2).
*/
std::string json::parser::parseUnicodeEscape()
{
// jump to the first hex value
pos_++;
// parse the hex first hex values
unsigned int firstCodePoint = parse4HexCodePoint();


if (firstCodePoint >= 0xD800 && firstCodePoint <= 0xDBFF)
{
// we found invalid code points, which means we either have a malformed
// input or we found a high surrogate.
// we can only find out by seeing if the next character also wants to
// encode a unicode character (so, we have the \uXXXX\uXXXX case here).

// jump behind the next \u
pos_ += 2;
// try to parse the next hex values.
// the method does boundary checking for us, so no need to do that here
unsigned secondCodePoint = parse4HexCodePoint();
// ok, we have a low surrogate, check if it is a valid one
if (secondCodePoint >= 0xDC00 && secondCodePoint <= 0xDFFF)
{
// calculate the code point from the pair according to the spec
unsigned int finalCodePoint =
// high surrogate occupies the most significant 22 bits
(firstCodePoint << 10)
// low surrogate occupies the least significant 15 bits
+ secondCodePoint
// there is still the 0xD800, 0xDC00 and 0x10000 noise in
// the result
// so we have to substract with:
// (0xD800 << 10) + DC00 - 0x10000 = 0x35FDC00
- 0x35FDC00;

// we transform the calculated point into UTF-8
return codePointToUTF8(finalCodePoint);
}
else
error("missing low surrogate");

}
// We have Form 1, so we just interpret the XXXX as a code point
return codePointToUTF8(firstCodePoint);
}


/*!
This function is called in case a \p "t" is read in the main parse function
@ref parse. In the standard, the \p "true" token is the only candidate, so the
Expand Down
6 changes: 6 additions & 0 deletions src/json.h
Original file line number Diff line number Diff line change
Expand Up @@ -418,6 +418,12 @@ class json
inline void error(const std::string&) __attribute__((noreturn));
/// parse a quoted string
inline std::string parseString();
/// transforms a unicode codepoint to it's UTF-8 presentation
std::string codePointToUTF8(unsigned int codePoint);
/// parses 4 hex characters that represent a unicode code point
inline unsigned int parse4HexCodePoint();
/// parses \uXXXX[\uXXXX] unicode escape characters
inline std::string parseUnicodeEscape();
/// parse a Boolean "true"
inline void parseTrue();
/// parse a Boolean "false"
Expand Down
Loading