From c9a716c2b97d50c57e401b0b1cfbeb16f9c01127 Mon Sep 17 00:00:00 2001 From: "Wladimir J. van der Laan" Date: Fri, 15 Apr 2016 18:39:40 +0200 Subject: [PATCH] Handle UTF-8 This adds full UTF-8 support both on input and output. Input: read and validate full UTF-8, both Basic Multilingual Plane and extended characters. Collate surrogate pairs as specified in RFC4627. This ensures that UTF-8 strings that reach the application are always valid, and that invalid UTF-8 fails parsing. Output: Assume UTF-8 strings provided for output are valid. The escaping was broken, fix this by not encoding UTF-8 characters with \u. Writing them to the output stream as-is is the right thing to do. See https://www.ietf.org/rfc/rfc4627.txt: "JSON text SHALL be encoded in Unicode. The default encoding is UTF-8." Also add tests for the new functionality. Fixes #16. --- Makefile.am | 9 ++- lib/univalue_read.cpp | 37 +++++------- lib/univalue_utffilter.h | 119 +++++++++++++++++++++++++++++++++++++++ lib/univalue_write.cpp | 11 +--- test/fail38.json | 1 + test/fail39.json | 1 + test/fail40.json | 1 + test/fail41.json | 1 + test/round2.json | 1 + test/unitester.cpp | 31 ++++++++++ 10 files changed, 178 insertions(+), 34 deletions(-) create mode 100644 lib/univalue_utffilter.h create mode 100644 test/fail38.json create mode 100644 test/fail39.json create mode 100644 test/fail40.json create mode 100644 test/fail41.json create mode 100644 test/round2.json diff --git a/Makefile.am b/Makefile.am index 34fe9e3f13dbcd..6c1ec81e63fb7c 100644 --- a/Makefile.am +++ b/Makefile.am @@ -3,7 +3,7 @@ ACLOCAL_AMFLAGS = -I build-aux/m4 .INTERMEDIATE: $(GENBIN) include_HEADERS = include/univalue.h -noinst_HEADERS = lib/univalue_escapes.h +noinst_HEADERS = lib/univalue_escapes.h lib/univalue_utffilter.h lib_LTLIBRARIES = libunivalue.la @@ -73,6 +73,10 @@ TEST_FILES = \ $(TEST_DATA_DIR)/fail35.json \ $(TEST_DATA_DIR)/fail36.json \ $(TEST_DATA_DIR)/fail37.json \ + $(TEST_DATA_DIR)/fail38.json \ + 
$(TEST_DATA_DIR)/fail39.json \ + $(TEST_DATA_DIR)/fail40.json \ + $(TEST_DATA_DIR)/fail41.json \ $(TEST_DATA_DIR)/fail3.json \ $(TEST_DATA_DIR)/fail4.json \ $(TEST_DATA_DIR)/fail5.json \ @@ -83,6 +87,7 @@ TEST_FILES = \ $(TEST_DATA_DIR)/pass1.json \ $(TEST_DATA_DIR)/pass2.json \ $(TEST_DATA_DIR)/pass3.json \ - $(TEST_DATA_DIR)/round1.json + $(TEST_DATA_DIR)/round1.json \ + $(TEST_DATA_DIR)/round2.json EXTRA_DIST=$(TEST_FILES) $(GEN_SRCS) diff --git a/lib/univalue_read.cpp b/lib/univalue_read.cpp index c7516b9628169d..95bac6958d0fa7 100644 --- a/lib/univalue_read.cpp +++ b/lib/univalue_read.cpp @@ -6,6 +6,7 @@ #include #include #include "univalue.h" +#include "univalue_utffilter.h" using namespace std; @@ -174,41 +175,31 @@ enum jtokentype getJsonToken(string& tokenVal, unsigned int& consumed, raw++; // skip " string valStr; + JSONUTF8StringFilter writer(valStr); while (*raw) { - if (*raw < 0x20) + if ((unsigned char)*raw < 0x20) return JTOK_ERR; else if (*raw == '\\') { raw++; // skip backslash switch (*raw) { - case '"': valStr += "\""; break; - case '\\': valStr += "\\"; break; - case '/': valStr += "/"; break; - case 'b': valStr += "\b"; break; - case 'f': valStr += "\f"; break; - case 'n': valStr += "\n"; break; - case 'r': valStr += "\r"; break; - case 't': valStr += "\t"; break; + case '"': writer.push_back('\"'); break; + case '\\': writer.push_back('\\'); break; + case '/': writer.push_back('/'); break; + case 'b': writer.push_back('\b'); break; + case 'f': writer.push_back('\f'); break; + case 'n': writer.push_back('\n'); break; + case 'r': writer.push_back('\r'); break; + case 't': writer.push_back('\t'); break; case 'u': { unsigned int codepoint; if (hatoui(raw + 1, raw + 1 + 4, codepoint) != raw + 1 + 4) return JTOK_ERR; - - if (codepoint <= 0x7f) - valStr.push_back((char)codepoint); - else if (codepoint <= 0x7FF) { - valStr.push_back((char)(0xC0 | (codepoint >> 6))); - valStr.push_back((char)(0x80 | (codepoint & 0x3F))); - } else if (codepoint <= 
// Copyright 2016 Wladimir J. van der Laan
// Distributed under the MIT software license, see the accompanying
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
#ifndef UNIVALUE_UTFFILTER_H
#define UNIVALUE_UTFFILTER_H

#include <string>

/**
 * Filter that generates and validates UTF-8, as well as collates UTF-16
 * surrogate pairs as specified in RFC4627.
 *
 * Usage: construct the filter around the output string, feed raw input
 * bytes through push_back() and the value of every \uXXXX escape through
 * push_back_u(), then call finalize(). The output string is only meaningful
 * when finalize() returns true.
 *
 * Input is rejected (finalize() returns false) when it contains:
 *  - a stray continuation byte, a reserved lead byte (0xF8..0xFF) or a
 *    multi-byte sequence that is interrupted or left unfinished;
 *  - an overlong encoding, a value above U+10FFFF or a surrogate encoded
 *    directly as UTF-8 bytes;
 *  - a \u high surrogate that is not immediately followed by a \u low
 *    surrogate, or a \u low surrogate without a preceding high surrogate.
 *
 * Once an error has been seen the filter stays invalid; it never recovers.
 */
class JSONUTF8StringFilter
{
public:
    /** Bind the filter to the string that receives the validated UTF-8. */
    JSONUTF8StringFilter(std::string &s):
        str(s), is_valid(true), pending(0), min_codepoint(0), state(0), surpair(0)
    {
    }

    /** Write single 8-bit char (may be part of UTF-8 sequence). */
    void push_back(unsigned char ch)
    {
        if (state == 0) {
            if (ch < 0x80) { // 7-bit ASCII, direct pass-through
                // An ASCII character may not sit between the two halves of
                // a surrogate pair; without this check "\ud834a\udd61"
                // would be accepted as valid.
                if (surpair)
                    is_valid = false;
                str.push_back(static_cast<char>(ch));
            } else if (ch < 0xc0) // Mid-sequence character, invalid in this state
                is_valid = false;
            else if (ch < 0xe0) // Start of 2-byte sequence
                begin_sequence(ch & 0x1f, 6, 0x80);
            else if (ch < 0xf0) // Start of 3-byte sequence
                begin_sequence(ch & 0x0f, 12, 0x800);
            else if (ch < 0xf8) // Start of 4-byte sequence
                begin_sequence(ch & 0x07, 18, 0x10000);
            else // Reserved, invalid
                is_valid = false;
        } else {
            if ((ch & 0xc0) != 0x80) // Not a continuation, invalid
                is_valid = false;
            state -= 6;
            pending |= static_cast<unsigned int>(ch & 0x3f) << state;
            if (state == 0)
                finish_sequence();
        }
    }

    /**
     * Write codepoint directly, possibly collating surrogate pairs.
     * Intended for the numeric value of a \uXXXX escape.
     */
    void push_back_u(unsigned int codepoint)
    {
        if (state) // Only accept full codepoints in open state
            is_valid = false;
        if (codepoint >= 0xD800 && codepoint < 0xDC00) { // First half of surrogate pair
            if (surpair) // Two subsequent surrogate pair openers - fail
                is_valid = false;
            else
                surpair = codepoint;
        } else if (codepoint >= 0xDC00 && codepoint < 0xE000) { // Second half of surrogate pair
            if (surpair) { // Open surrogate pair, expect second half
                // Compute code point from UTF-16 surrogate pair. The 20
                // payload bits must be ADDED to 0x10000: OR-ing them in
                // loses the carry whenever the high surrogate is 0xD840 or
                // above, i.e. for every code point from U+20000 upwards.
                append_codepoint(0x10000 + (((surpair - 0xD800) << 10) | (codepoint - 0xDC00)));
                surpair = 0;
            } else // Second half doesn't follow a first half - fail
                is_valid = false;
        } else {
            if (surpair) // First half of surrogate pair not followed by second - fail
                is_valid = false;
            else if (codepoint > 0x10FFFF) // Outside the Unicode code space
                is_valid = false;
            else
                append_codepoint(codepoint);
        }
    }

    /**
     * Check that we're in a state where the string can be ended:
     * no open sequences, no open surrogate pairs, no earlier error.
     * Returns true when everything written so far was valid.
     */
    bool finalize()
    {
        if (state || surpair)
            is_valid = false;
        return is_valid;
    }

private:
    std::string &str;
    bool is_valid;
    // Current UTF-8 decoding state
    unsigned int pending;       // Code point bits collected so far
    unsigned int min_codepoint; // Smallest value the open sequence may encode
    int state; // Top bit to be filled in for next UTF-8 byte, or 0

    // Keep track of an open surrogate pair to handle this section of
    // RFC4627:
    //
    //    To escape an extended character that is not in the Basic Multilingual
    //    Plane, the character is represented as a twelve-character sequence,
    //    encoding the UTF-16 surrogate pair.  So, for example, a string
    //    containing only the G clef character (U+1D11E) may be represented as
    //    "\uD834\uDD1E".
    //
    // Two subsequent \u.... may have to be replaced with one actual codepoint.
    unsigned int surpair; // First half of open UTF-16 surrogate pair, or 0

    // Open a multi-byte sequence: remember the payload bits of the lead
    // byte, the bit position the next continuation byte fills, and the
    // smallest code point this sequence length is allowed to encode.
    void begin_sequence(unsigned int lead_bits, int shift, unsigned int minimum)
    {
        pending = lead_bits << shift;
        min_codepoint = minimum;
        state = shift;
    }

    // Validate a code point that was fully decoded from raw bytes and emit it.
    void finish_sequence()
    {
        if (pending < min_codepoint)                      // Overlong encoding
            is_valid = false;
        else if (pending > 0x10FFFF)                      // Outside the Unicode code space
            is_valid = false;
        else if (pending >= 0xD800 && pending < 0xE000)   // Surrogates are not valid in UTF-8
            is_valid = false;
        else if (surpair)                                 // Open \u pair must be closed by \u
            is_valid = false;
        else
            append_codepoint(pending);
    }

    // Append the canonical (shortest) UTF-8 encoding of a code point.
    // Callers guarantee codepoint <= 0x10FFFF.
    void append_codepoint(unsigned int codepoint)
    {
        if (codepoint <= 0x7f)
            str.push_back(static_cast<char>(codepoint));
        else if (codepoint <= 0x7FF) {
            str.push_back(static_cast<char>(0xC0 | (codepoint >> 6)));
            str.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
        } else if (codepoint <= 0xFFFF) {
            str.push_back(static_cast<char>(0xE0 | (codepoint >> 12)));
            str.push_back(static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F)));
            str.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
        } else if (codepoint <= 0x10FFFF) {
            str.push_back(static_cast<char>(0xF0 | (codepoint >> 18)));
            str.push_back(static_cast<char>(0x80 | ((codepoint >> 12) & 0x3F)));
            str.push_back(static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F)));
            str.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
        }
    }
};

#endif
(ch < 0x80) + else outS += ch; - - else { // TODO handle UTF-8 properly - char tmpesc[16]; - sprintf(tmpesc, "\\u%04x", ch); - outS += tmpesc; - } } return outS; diff --git a/test/fail38.json b/test/fail38.json new file mode 100644 index 00000000000000..b245e2e46cad56 --- /dev/null +++ b/test/fail38.json @@ -0,0 +1 @@ +["\ud834"] diff --git a/test/fail39.json b/test/fail39.json new file mode 100644 index 00000000000000..7c9e263f27de5d --- /dev/null +++ b/test/fail39.json @@ -0,0 +1 @@ +["\udd61"] diff --git a/test/fail40.json b/test/fail40.json new file mode 100644 index 00000000000000..664dc9e245f65c --- /dev/null +++ b/test/fail40.json @@ -0,0 +1 @@ +["���"] \ No newline at end of file diff --git a/test/fail41.json b/test/fail41.json new file mode 100644 index 00000000000000..0de342a2b5fdf5 --- /dev/null +++ b/test/fail41.json @@ -0,0 +1 @@ +["�"] \ No newline at end of file diff --git a/test/round2.json b/test/round2.json new file mode 100644 index 00000000000000..b766cccc688992 --- /dev/null +++ b/test/round2.json @@ -0,0 +1 @@ +["a§■𐎒𝅘𝅥𝅯"] diff --git a/test/unitester.cpp b/test/unitester.cpp index 5a052fe92c85be..05f3842cd1eb63 100644 --- a/test/unitester.cpp +++ b/test/unitester.cpp @@ -22,6 +22,7 @@ string srcdir(JSON_TEST_SRC); static bool test_failed = false; #define d_assert(expr) { if (!(expr)) { test_failed = true; fprintf(stderr, "%s failed\n", filename.c_str()); } } +#define f_assert(expr) { if (!(expr)) { test_failed = true; fprintf(stderr, "%s failed\n", __func__); } } static std::string rtrim(std::string s) { @@ -108,6 +109,10 @@ static const char *filenames[] = { "fail35.json", "fail36.json", "fail37.json", + "fail38.json", // invalid unicode: only first half of surrogate pair + "fail39.json", // invalid unicode: only second half of surrogate pair + "fail40.json", // invalid unicode: broken UTF-8 + "fail41.json", // invalid unicode: unfinished UTF-8 "fail3.json", "fail4.json", // extra comma "fail5.json", @@ -119,14 +124,40 @@ static const char 
*filenames[] = { "pass2.json", "pass3.json", "round1.json", // round-trip test + "round2.json", // unicode }; +// Test \u handling +void unescape_unicode_test() +{ + UniValue val; + bool testResult; + // Escaped ASCII (quote) + testResult = val.read("[\"\\u0022\"]"); + f_assert(testResult); + f_assert(val[0].get_str() == "\""); + // Escaped Basic Plane character, two-byte UTF-8 + testResult = val.read("[\"\\u0191\"]"); + f_assert(testResult); + f_assert(val[0].get_str() == "\xc6\x91"); + // Escaped Basic Plane character, three-byte UTF-8 + testResult = val.read("[\"\\u2191\"]"); + f_assert(testResult); + f_assert(val[0].get_str() == "\xe2\x86\x91"); + // Escaped Supplementary Plane character U+1d161 + testResult = val.read("[\"\\ud834\\udd61\"]"); + f_assert(testResult); + f_assert(val[0].get_str() == "\xf0\x9d\x85\xa1"); +} + int main (int argc, char *argv[]) { for (unsigned int fidx = 0; fidx < ARRAY_SIZE(filenames); fidx++) { runtest_file(filenames[fidx]); } + unescape_unicode_test(); + return test_failed ? 1 : 0; }