small tidy up after pull request #22

- removed IDE leftovers - adjusted const’ness of some functions - style guide
nlohmann · Jan 11, 2015 · bd05c7a · bd05c7a
1 parent 6533b33
commit bd05c7a
Show file tree

Hide file tree

Showing 9 changed files with 247 additions and 107 deletions.
diff --git a/.gitignore b/.gitignore
@@ -46,3 +46,5 @@ CMakeFiles
 libjson.a
 
 Testing
+
+.idea
diff --git a/.idea/codeStyleSettings.xml b/.idea/codeStyleSettings.xml
diff --git a/.idea/json.iml b/.idea/json.iml
diff --git a/.idea/misc.xml b/.idea/misc.xml
diff --git a/.idea/modules.xml b/.idea/modules.xml
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
diff --git a/header_only/json.h b/header_only/json.h
@@ -418,9 +418,15 @@ class json
         /// read the next character, stripping whitespace
         bool next();
         /// raise an exception with an error message
-        inline void error(const std::string&) __attribute__((noreturn));
+        inline void error(const std::string&) const __attribute__((noreturn));
         /// parse a quoted string
         inline std::string parseString();
+        /// transforms a unicode codepoint to it's UTF-8 presentation
+        std::string codePointToUTF8(unsigned int codePoint) const;
+        /// parses 4 hex characters that represent a unicode code point
+        inline unsigned int parse4HexCodePoint();
+        /// parses \uXXXX[\uXXXX] unicode escape characters
+        inline std::string parseUnicodeEscape();
         /// parse a Boolean "true"
         inline void parseTrue();
         /// parse a Boolean "false"
@@ -2434,9 +2440,9 @@ json json::parser::parse()
             // remember position of number's first character
             const auto _firstpos_ = pos_ - 1;
 
-            while (next() and (std::isdigit(current_) || current_ == '.'
-                               || current_ == 'e' || current_ == 'E'
-                               || current_ == '+' || current_ == '-'));
+            while (next() and (std::isdigit(current_) or current_ == '.'
+                               or current_ == 'e' or current_ == 'E'
+                               or current_ == '+' or current_ == '-'));
 
             try
             {
@@ -2512,7 +2518,7 @@ the error message \p msg), and the last read token.
 
 @exception std::invalid_argument whenever the function is called
 */
-void json::parser::error(const std::string& msg)
+void json::parser::error(const std::string& msg) const
 {
     throw std::invalid_argument("parse error at position " +
                                 std::to_string(pos_) + ": " + msg +
@@ -2542,23 +2548,28 @@ std::string json::parser::parseString()
     // the result of the parse process
     std::string result;
 
-    // iterate with pos_ over the whole string
+    // iterate with pos_ over the whole input until we found the end and return
+    // or we exit via error()
     for (; pos_ < buffer_.size(); pos_++)
     {
         char currentChar = buffer_[pos_];
 
-        // uneven amount of backslashes means the user wants to escape something
-        if (!evenAmountOfBackslashes)
+        if (not evenAmountOfBackslashes)
         {
+            // uneven amount of backslashes means the user wants to escape
+            // something so we know there is a case such as '\X' or '\\\X' but
+            // we don't know yet what X is.
+            // at this point in the code, the currentChar has the value of X.
+
             // slash, backslash and quote are copied as is
             if (currentChar == '/' or currentChar == '\\' or currentChar == '"')
             {
                 result += currentChar;
             }
             else
             {
-                // all other characters are replaced by their respective
-                // special character
+                // all other characters are replaced by their respective special
+                // character
                 switch (currentChar)
                 {
                     case 't':
@@ -2586,12 +2597,26 @@ std::string json::parser::parseString()
                         result += '\r';
                         break;
                     }
+                    case 'u':
+                    {
+                        // \uXXXX[\uXXXX] is used for escaping unicode, which
+                        // has it's own subroutine.
+                        result += parseUnicodeEscape();
+                        // the parsing process has brought us one step behind
+                        // the unicode escape sequence:
+                        // \uXXXX
+                        //       ^
+                        // we need to go one character back or the parser would
+                        // skip the character we are currently pointing at as
+                        // the for-loop will decrement pos_ after this iteration
+                        pos_--;
+                        break;
+                    }
                     default:
                     {
-                        error("expected one of \\, /, b, f, n, r, t behind backslash.");
+                        error("expected one of \\, /, b, f, n, r, t, u behind backslash.");
                     }
                 }
-                // TODO implement \uXXXX
             }
         }
         else
@@ -2610,7 +2635,7 @@ std::string json::parser::parseString()
             }
             else if (currentChar != '\\')
             {
-                // All non-backslash characters are added to the end of the
+                // all non-backslash characters are added to the end of the
                 // result string. The only backslashes we want in the result
                 // are the ones that are escaped (which happens above).
                 result += currentChar;
@@ -2638,6 +2663,187 @@ std::string json::parser::parseString()
     error("expected '\"'");
 }
 
+
+
+/*!
+Turns a code point into it's UTF-8 representation.
+You should only pass numbers < 0x10ffff into this function
+(everything else is a invalid code point).
+
+@return the UTF-8 representation of the given code point
+*/
+std::string json::parser::codePointToUTF8(unsigned int codePoint) const
+{
+    // this method contains a lot of bit manipulations to
+    // build the bytes for UTF-8.
+
+    // the '(... >> S) & 0xHH'-patterns are used to retrieve
+    // certain bits from the code points.
+
+    // all static casts in this method have boundary checks
+
+    // we initialize all strings with their final length
+    // (e.g. 1 to 4 bytes) to save the reallocations.
+
+    if (codePoint <= 0x7f)
+    {
+        // it's just a ASCII compatible codePoint,
+        // so we just interpret the point as a character
+        // and return ASCII
+
+        return std::string(1, static_cast<char>(codePoint));
+    }
+    // if true, we need two bytes to encode this as UTF-8
+    else if (codePoint <= 0x7ff)
+    {
+        // the 0xC0 enables the two most significant two bits
+        // to make this a two-byte UTF-8 character.
+        std::string result(2, static_cast<char>(0xC0 | ((codePoint >> 6) & 0x1F)));
+        result[1] = static_cast<char>(0x80 | (codePoint & 0x3F));
+        return result;
+    }
+    // if true, now we need three bytes to encode this as UTF-8
+    else if (codePoint <= 0xffff)
+    {
+        // the 0xE0 enables the three most significant two bits
+        // to make this a three-byte UTF-8 character.
+        std::string result(3, static_cast<char>(0xE0 | ((codePoint >> 12) & 0x0F)));
+        result[1] = static_cast<char>(0x80 | ((codePoint >> 6) & 0x3F));
+        result[2] = static_cast<char>(0x80 | (codePoint & 0x3F));
+        return result;
+    }
+    // if true, we need maximal four bytes to encode this as UTF-8
+    else if (codePoint <= 0x10ffff)
+    {
+        // the 0xE0 enables the four most significant two bits
+        // to make this a three-byte UTF-8 character.
+        std::string result(4, static_cast<char>(0xF0 | ((codePoint >> 18) & 0x07)));
+        result[1] = static_cast<char>(0x80 | ((codePoint >> 12) & 0x3F));
+        result[2] = static_cast<char>(0x80 | ((codePoint >> 6) & 0x3F));
+        result[3] = static_cast<char>(0x80 | (codePoint & 0x3F));
+        return result;
+    }
+    else
+    {
+        // Can't be tested without direct access to this private method.
+        std::string errorMessage = "Invalid codePoint: ";
+        errorMessage += codePoint;
+        error(errorMessage);
+    }
+}
+
+/*!
+Parses 4 hexadecimal characters as a number.
+
+@return the value of the number the hexadecimal characters represent.
+
+@pre  pos_ is pointing to the first of the 4 hexadecimal characters.
+
+@post pos_ is pointing to the character after the 4 hexadecimal characters.
+*/
+unsigned int json::parser::parse4HexCodePoint()
+{
+    const auto startPos = pos_;
+
+    // check if the  remaining buffer is long enough to even hold 4 characters
+    if (pos_ + 3 >= buffer_.size())
+    {
+        error("Got end of input while parsing unicode escape sequence \\uXXXX");
+    }
+
+    // make a string that can hold the pair
+    std::string hexCode(4, ' ');
+
+    for (; pos_ < startPos + 4; pos_++)
+    {
+        // no boundary check here as we already checked above
+        char currentChar = buffer_[pos_];
+
+        // check if we have a hexadecimal character
+        if ((currentChar >= '0' and currentChar <= '9')
+                or (currentChar >= 'a' and currentChar <= 'f')
+                or (currentChar >= 'A' and currentChar <= 'F'))
+        {
+            // all is well, we have valid hexadecimal chars
+            // so we copy that char into our string
+            hexCode[pos_ - startPos] = currentChar;
+        }
+        else
+        {
+            error("Found non-hexadecimal character in unicode escape sequence!");
+        }
+    }
+    // the cast is safe as 4 hex characters can't present more than 16 bits
+    // the input to stoul was checked to contain only hexadecimal characters
+    // (see above)
+    return static_cast<unsigned int>(std::stoul(hexCode, nullptr, 16));
+}
+
+/*!
+Parses the unicode escape codes as defined in the ECMA-404.
+The escape sequence has two forms:
+1. \uXXXX
+2. \uXXXX\uYYYY
+where X and Y are a hexadecimal character (a-zA-Z0-9).
+
+Form 1 just contains the unicode code point in the hexadecimal number XXXX.
+Form 2 is encoding a UTF-16 surrogate pair. The high surrogate is XXXX, the low
+surrogate is YYYY.
+
+@return the UTF-8 character this unicode escape sequence escaped.
+
+@pre  pos_ is pointing at at the 'u' behind the first backslash.
+
+@post pos_ is pointing at the character behind the last X (or Y in form 2).
+*/
+std::string json::parser::parseUnicodeEscape()
+{
+    // jump to the first hex value
+    pos_++;
+    // parse the hex first hex values
+    unsigned int firstCodePoint = parse4HexCodePoint();
+
+    if (firstCodePoint >= 0xD800 and firstCodePoint <= 0xDBFF)
+    {
+        // we found invalid code points, which means we either have a malformed
+        // input or we found a high surrogate.
+        // we can only find out by seeing if the next character also wants to
+        // encode a unicode character (so, we have the \uXXXX\uXXXX case here).
+
+        // jump behind the next \u
+        pos_ += 2;
+        // try to parse the next hex values.
+        // the method does boundary checking for us, so no need to do that here
+        unsigned secondCodePoint = parse4HexCodePoint();
+        // ok, we have a low surrogate, check if it is a valid one
+        if (secondCodePoint >= 0xDC00 and secondCodePoint <= 0xDFFF)
+        {
+            // calculate the code point from the pair according to the spec
+            unsigned int finalCodePoint =
+                // high surrogate occupies the most significant 22 bits
+                (firstCodePoint << 10)
+                // low surrogate occupies the least significant 15 bits
+                + secondCodePoint
+                // there is still the 0xD800, 0xDC00 and 0x10000 noise in
+                // the result
+                // so we have to substract with:
+                // (0xD800 << 10) + DC00 - 0x10000 = 0x35FDC00
+                - 0x35FDC00;
+
+            // we transform the calculated point into UTF-8
+            return codePointToUTF8(finalCodePoint);
+        }
+        else
+        {
+            error("missing low surrogate");
+        }
+
+    }
+    // We have Form 1, so we just interpret the XXXX as a code point
+    return codePointToUTF8(firstCodePoint);
+}
+
+
 /*!
 This function is called in case a \p "t" is read in the main parse function
 @ref parse. In the standard, the \p "true" token is the only candidate, so the