Skip to content

Commit

Permalink
small tidy up after pull request #22
Browse files Browse the repository at this point in the history
- removed IDE leftovers
- adjusted const’ness of some functions
- style guide
  • Loading branch information
nlohmann committed Jan 11, 2015
1 parent 6533b33 commit bd05c7a
Show file tree
Hide file tree
Showing 9 changed files with 247 additions and 107 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -46,3 +46,5 @@ CMakeFiles
libjson.a

Testing

.idea
35 changes: 0 additions & 35 deletions .idea/codeStyleSettings.xml

This file was deleted.

8 changes: 0 additions & 8 deletions .idea/json.iml

This file was deleted.

5 changes: 0 additions & 5 deletions .idea/misc.xml

This file was deleted.

8 changes: 0 additions & 8 deletions .idea/modules.xml

This file was deleted.

6 changes: 0 additions & 6 deletions .idea/vcs.xml

This file was deleted.

232 changes: 219 additions & 13 deletions header_only/json.h
Original file line number Diff line number Diff line change
Expand Up @@ -418,9 +418,15 @@ class json
/// read the next character, stripping whitespace
bool next();
/// raise an exception with an error message
inline void error(const std::string&) __attribute__((noreturn));
inline void error(const std::string&) const __attribute__((noreturn));
/// parse a quoted string
inline std::string parseString();
/// transforms a unicode codepoint to it's UTF-8 presentation
std::string codePointToUTF8(unsigned int codePoint) const;
/// parses 4 hex characters that represent a unicode code point
inline unsigned int parse4HexCodePoint();
/// parses \uXXXX[\uXXXX] unicode escape characters
inline std::string parseUnicodeEscape();
/// parse a Boolean "true"
inline void parseTrue();
/// parse a Boolean "false"
Expand Down Expand Up @@ -2434,9 +2440,9 @@ json json::parser::parse()
// remember position of number's first character
const auto _firstpos_ = pos_ - 1;

while (next() and (std::isdigit(current_) || current_ == '.'
|| current_ == 'e' || current_ == 'E'
|| current_ == '+' || current_ == '-'));
while (next() and (std::isdigit(current_) or current_ == '.'
or current_ == 'e' or current_ == 'E'
or current_ == '+' or current_ == '-'));

try
{
Expand Down Expand Up @@ -2512,7 +2518,7 @@ the error message \p msg), and the last read token.
@exception std::invalid_argument whenever the function is called
*/
void json::parser::error(const std::string& msg)
void json::parser::error(const std::string& msg) const
{
throw std::invalid_argument("parse error at position " +
std::to_string(pos_) + ": " + msg +
Expand Down Expand Up @@ -2542,23 +2548,28 @@ std::string json::parser::parseString()
// the result of the parse process
std::string result;

// iterate with pos_ over the whole string
// iterate with pos_ over the whole input until we found the end and return
// or we exit via error()
for (; pos_ < buffer_.size(); pos_++)
{
char currentChar = buffer_[pos_];

// uneven amount of backslashes means the user wants to escape something
if (!evenAmountOfBackslashes)
if (not evenAmountOfBackslashes)
{
// uneven amount of backslashes means the user wants to escape
// something so we know there is a case such as '\X' or '\\\X' but
// we don't know yet what X is.
// at this point in the code, the currentChar has the value of X.

// slash, backslash and quote are copied as is
if (currentChar == '/' or currentChar == '\\' or currentChar == '"')
{
result += currentChar;
}
else
{
// all other characters are replaced by their respective
// special character
// all other characters are replaced by their respective special
// character
switch (currentChar)
{
case 't':
Expand Down Expand Up @@ -2586,12 +2597,26 @@ std::string json::parser::parseString()
result += '\r';
break;
}
case 'u':
{
// \uXXXX[\uXXXX] is used for escaping unicode, which
// has it's own subroutine.
result += parseUnicodeEscape();
// the parsing process has brought us one step behind
// the unicode escape sequence:
// \uXXXX
// ^
// we need to go one character back or the parser would
// skip the character we are currently pointing at as
// the for-loop will decrement pos_ after this iteration
pos_--;
break;
}
default:
{
error("expected one of \\, /, b, f, n, r, t behind backslash.");
error("expected one of \\, /, b, f, n, r, t, u behind backslash.");
}
}
// TODO implement \uXXXX
}
}
else
Expand All @@ -2610,7 +2635,7 @@ std::string json::parser::parseString()
}
else if (currentChar != '\\')
{
// All non-backslash characters are added to the end of the
// all non-backslash characters are added to the end of the
// result string. The only backslashes we want in the result
// are the ones that are escaped (which happens above).
result += currentChar;
Expand Down Expand Up @@ -2638,6 +2663,187 @@ std::string json::parser::parseString()
error("expected '\"'");
}



/*!
Turns a code point into it's UTF-8 representation.
You should only pass numbers < 0x10ffff into this function
(everything else is a invalid code point).
@return the UTF-8 representation of the given code point
*/
std::string json::parser::codePointToUTF8(unsigned int codePoint) const
{
// this method contains a lot of bit manipulations to
// build the bytes for UTF-8.

// the '(... >> S) & 0xHH'-patterns are used to retrieve
// certain bits from the code points.

// all static casts in this method have boundary checks

// we initialize all strings with their final length
// (e.g. 1 to 4 bytes) to save the reallocations.

if (codePoint <= 0x7f)
{
// it's just a ASCII compatible codePoint,
// so we just interpret the point as a character
// and return ASCII

return std::string(1, static_cast<char>(codePoint));
}
// if true, we need two bytes to encode this as UTF-8
else if (codePoint <= 0x7ff)
{
// the 0xC0 enables the two most significant two bits
// to make this a two-byte UTF-8 character.
std::string result(2, static_cast<char>(0xC0 | ((codePoint >> 6) & 0x1F)));
result[1] = static_cast<char>(0x80 | (codePoint & 0x3F));
return result;
}
// if true, now we need three bytes to encode this as UTF-8
else if (codePoint <= 0xffff)
{
// the 0xE0 enables the three most significant two bits
// to make this a three-byte UTF-8 character.
std::string result(3, static_cast<char>(0xE0 | ((codePoint >> 12) & 0x0F)));
result[1] = static_cast<char>(0x80 | ((codePoint >> 6) & 0x3F));
result[2] = static_cast<char>(0x80 | (codePoint & 0x3F));
return result;
}
// if true, we need maximal four bytes to encode this as UTF-8
else if (codePoint <= 0x10ffff)
{
// the 0xE0 enables the four most significant two bits
// to make this a three-byte UTF-8 character.
std::string result(4, static_cast<char>(0xF0 | ((codePoint >> 18) & 0x07)));
result[1] = static_cast<char>(0x80 | ((codePoint >> 12) & 0x3F));
result[2] = static_cast<char>(0x80 | ((codePoint >> 6) & 0x3F));
result[3] = static_cast<char>(0x80 | (codePoint & 0x3F));
return result;
}
else
{
// Can't be tested without direct access to this private method.
std::string errorMessage = "Invalid codePoint: ";
errorMessage += codePoint;
error(errorMessage);
}
}

/*!
Parses 4 hexadecimal characters as a number.
@return the value of the number the hexadecimal characters represent.
@pre pos_ is pointing to the first of the 4 hexadecimal characters.
@post pos_ is pointing to the character after the 4 hexadecimal characters.
*/
unsigned int json::parser::parse4HexCodePoint()
{
const auto startPos = pos_;

// check if the remaining buffer is long enough to even hold 4 characters
if (pos_ + 3 >= buffer_.size())
{
error("Got end of input while parsing unicode escape sequence \\uXXXX");
}

// make a string that can hold the pair
std::string hexCode(4, ' ');

for (; pos_ < startPos + 4; pos_++)
{
// no boundary check here as we already checked above
char currentChar = buffer_[pos_];

// check if we have a hexadecimal character
if ((currentChar >= '0' and currentChar <= '9')
or (currentChar >= 'a' and currentChar <= 'f')
or (currentChar >= 'A' and currentChar <= 'F'))
{
// all is well, we have valid hexadecimal chars
// so we copy that char into our string
hexCode[pos_ - startPos] = currentChar;
}
else
{
error("Found non-hexadecimal character in unicode escape sequence!");
}
}
// the cast is safe as 4 hex characters can't present more than 16 bits
// the input to stoul was checked to contain only hexadecimal characters
// (see above)
return static_cast<unsigned int>(std::stoul(hexCode, nullptr, 16));
}

/*!
Parses the unicode escape codes as defined in the ECMA-404.
The escape sequence has two forms:
1. \uXXXX
2. \uXXXX\uYYYY
where X and Y are a hexadecimal character (a-zA-Z0-9).
Form 1 just contains the unicode code point in the hexadecimal number XXXX.
Form 2 is encoding a UTF-16 surrogate pair. The high surrogate is XXXX, the low
surrogate is YYYY.
@return the UTF-8 character this unicode escape sequence escaped.
@pre pos_ is pointing at at the 'u' behind the first backslash.
@post pos_ is pointing at the character behind the last X (or Y in form 2).
*/
std::string json::parser::parseUnicodeEscape()
{
// jump to the first hex value
pos_++;
// parse the hex first hex values
unsigned int firstCodePoint = parse4HexCodePoint();

if (firstCodePoint >= 0xD800 and firstCodePoint <= 0xDBFF)
{
// we found invalid code points, which means we either have a malformed
// input or we found a high surrogate.
// we can only find out by seeing if the next character also wants to
// encode a unicode character (so, we have the \uXXXX\uXXXX case here).

// jump behind the next \u
pos_ += 2;
// try to parse the next hex values.
// the method does boundary checking for us, so no need to do that here
unsigned secondCodePoint = parse4HexCodePoint();
// ok, we have a low surrogate, check if it is a valid one
if (secondCodePoint >= 0xDC00 and secondCodePoint <= 0xDFFF)
{
// calculate the code point from the pair according to the spec
unsigned int finalCodePoint =
// high surrogate occupies the most significant 22 bits
(firstCodePoint << 10)
// low surrogate occupies the least significant 15 bits
+ secondCodePoint
// there is still the 0xD800, 0xDC00 and 0x10000 noise in
// the result
// so we have to substract with:
// (0xD800 << 10) + DC00 - 0x10000 = 0x35FDC00
- 0x35FDC00;

// we transform the calculated point into UTF-8
return codePointToUTF8(finalCodePoint);
}
else
{
error("missing low surrogate");
}

}
// We have Form 1, so we just interpret the XXXX as a code point
return codePointToUTF8(firstCodePoint);
}


/*!
This function is called in case a \p "t" is read in the main parse function
@ref parse. In the standard, the \p "true" token is the only candidate, so the
Expand Down
Loading

0 comments on commit bd05c7a

Please sign in to comment.