From 6b78b5c2be7e24cd4a1c59c7fa13a62c2ae33508 Mon Sep 17 00:00:00 2001 From: Alex Astashyn Date: Sat, 3 Dec 2016 19:05:09 -0500 Subject: [PATCH 01/18] Added strtonum for locale-independent number parsing --- src/json.hpp | 354 +++++++++++++++++++++++------------ src/json.hpp.re2c | 354 +++++++++++++++++++++++------------ test/src/unit-regression.cpp | 4 +- 3 files changed, 463 insertions(+), 249 deletions(-) diff --git a/src/json.hpp b/src/json.hpp index 6fed0a1225..e28333bd47 100644 --- a/src/json.hpp +++ b/src/json.hpp @@ -9060,177 +9060,283 @@ class basic_json return result; } - /*! - @brief parse floating point number + + - This function (and its overloads) serves to select the most approprate - standard floating point number parsing function based on the type - supplied via the first parameter. Set this to @a - static_cast(nullptr). - @param[in] type the @ref number_float_t in use - @param[in,out] endptr recieves a pointer to the first character after - the number - @return the floating point number - */ - long double str_to_float_t(long double* /* type */, char** endptr) const - { - return std::strtold(reinterpret_cast(m_start), endptr); - } - /*! - @brief parse floating point number - This function (and its overloads) serves to select the most approprate - standard floating point number parsing function based on the type - supplied via the first parameter. Set this to @a - static_cast(nullptr). - @param[in] type the @ref number_float_t in use - @param[in,out] endptr recieves a pointer to the first character after - the number - @return the floating point number - */ - double str_to_float_t(double* /* type */, char** endptr) const - { - return std::strtod(reinterpret_cast(m_start), endptr); - } - /*! - @brief parse floating point number - This function (and its overloads) serves to select the most approprate - standard floating point number parsing function based on the type - supplied via the first parameter. Set this to @a - static_cast(nullptr). - @param[in] type the @ref number_float_t in use - @param[in,out] endptr recieves a pointer to the first character after - the number - @return the floating point number + + /*! + @brief parse string into a built-in arithmetic type as if + the current locale is POSIX. + + e.g. const auto x = static_cast("123.4"); + + throw if can't parse the entire string as a number, + or if the destination type is integral and the value + is outside of the type's range. */ - float str_to_float_t(float* /* type */, char** endptr) const + struct strtonum { - return std::strtof(reinterpret_cast(m_start), endptr); - } + public: + strtonum(const char* start, const char* end) + : m_start(start), m_end(end) + {} - /*! - @brief return number value for number tokens + template::value>::type > + operator T() const + { + T val{0}; - This function translates the last token into the most appropriate - number type (either integer, unsigned integer or floating point), - which is passed back to the caller via the result parameter. + if(parse(val, std::is_integral())) { + return val; + } - This function parses the integer component up to the radix point or - exponent while collecting information about the 'floating point - representation', which it stores in the result parameter. If there is - no radix point or exponent, and the number can fit into a @ref - number_integer_t or @ref number_unsigned_t then it sets the result - parameter accordingly. + throw std::invalid_argument( + std::string() + + "Can't parse '" + + std::string(m_start, m_end) + + "' as type " + + typeid(T).name()); + } - If the number is a floating point number the number is then parsed - using @a std:strtod (or @a std:strtof or @a std::strtold). + /// return true iff token matches ^[+-]\d+$ + bool is_integral() const + { + const char* p = m_start; - @param[out] result @ref basic_json object to receive the number, or - NAN if the conversion read past the current token. The latter case - needs to be treated by the caller function. - */ - void get_number(basic_json& result) const - { - assert(m_start != nullptr); + if(!p) { + return false; + } - const lexer::lexer_char_t* curptr = m_start; + if(*p == '-' or *p == '+') { + ++p; + } - // accumulate the integer conversion result (unsigned for now) - number_unsigned_t value = 0; + if(p == m_end) { + return false; + } + + while(p < m_end and *p >= '0' and *p <= '9') { + ++p; + } - // maximum absolute value of the relevant integer type - number_unsigned_t max; + return p == m_end; + } - // temporarily store the type to avoid unecessary bitfield access - value_t type; + private: + const char* const m_start = nullptr; + const char* const m_end = nullptr; - // look for sign - if (*curptr == '-') + static void strtof(float& f, + const char* str, + char** endptr) { - type = value_t::number_integer; - max = static_cast((std::numeric_limits::max)()) + 1; - curptr++; + f = std::strtof(str, endptr); } - else + + static void strtof(double& f, + const char* str, + char** endptr) { - type = value_t::number_unsigned; - max = static_cast((std::numeric_limits::max)()); + f = std::strtod(str, endptr); } - // count the significant figures - for (; curptr < m_cursor; curptr++) + static void strtof(long double& f, + const char* str, + char** endptr) { - // quickly skip tests if a digit - if (*curptr < '0' || *curptr > '9') - { - if (*curptr == '.') - { - // don't count '.' but change to float - type = value_t::number_float; - continue; + f = std::strtold(str, endptr); + } + + template + bool parse(T& value, /*is_integral=*/std::false_type) const + { + const char* data = m_start; + const size_t len = static_cast(m_end - m_start); + + const char decimal_point_char = + std::use_facet< std::numpunct >( + std::locale()).decimal_point(); + + // replace decimal separator with locale-specific + // version, if necessary; data will be repointed + // to either buf or tempstr containing the fixed + // string. + std::string tempstr; + std::array buf; + do { + if(decimal_point_char == '.') { + break; // don't need to convert } - // assume exponent (if not then will fail parse): change to - // float, stop counting and record exponent details - type = value_t::number_float; - break; - } - // skip if definitely not an integer - if (type != value_t::number_float) - { - // multiply last value by ten and add the new digit - auto temp = value * 10 + *curptr - '0'; + const size_t ds_pos = static_cast( + std::find(m_start, m_end, '.') - m_start ); - // test for overflow - if (temp < value || temp > max) - { - // overflow - type = value_t::number_float; + if(ds_pos == len) { + break; // no decimal separator } - else - { - // no overflow - save it - value = temp; + + // copy the data into the local buffer or + // tempstr, if buffer is too small; + // replace decimal separator, and update + // data to point to the modified bytes + if(len + 1 < buf.size()) { + std::copy(m_start, m_end, buf.data()); + buf[len] = 0; + buf[ds_pos] = decimal_point_char; + data = buf.data(); + } else { + tempstr.assign(m_start, m_end); + tempstr[ds_pos] = decimal_point_char; + data = tempstr.c_str(); } + } while(0); + + char* endptr = nullptr; + value = 0; + strtof(value, data, &endptr); + + + + // note that reading past the end is OK, the data may be, + // for example, "123.", where the parsed token only contains "123", + // but strtod will read the dot as well. + const bool ok = endptr >= data + len + and len > 0; + + if(ok and value == 0.0 and *data == '-') { + // some implementations forget to negate the zero + value = -0.0; } - } - // save the value (if not a float) - if (type == value_t::number_unsigned) - { - result.m_value.number_unsigned = value; + if(!ok) { + std::cerr << "data:" << data + << " token:" << std::string(m_start, len) + << " value:" << value + << " len:" << len + << " parsed_len:" << (endptr - data) << std::endl; + } + + return ok; } - else if (type == value_t::number_integer) + + template + bool parse(T& value, /*is_integral=*/std::true_type) const { - result.m_value.number_integer = -static_cast(value); + const char* beg = m_start; + const char* const end = m_end; + + if(beg == end) { + return false; + } + + const bool is_negative = (*beg == '-'); + + // json numbers can't start with '+' but + // this code is not json-specific + if(is_negative or *beg == '+') { + ++beg; // skip the leading sign + } + + bool valid = beg < end // must have some digits; + and ( T(-1) < 0 // type must be signed + or !is_negative); // if value is negative + value = 0; + + while(beg < end and valid) { + const uint8_t c = static_cast(*beg - '0'); + const T upd_value = value * 10 + c; + valid &= value <= std::numeric_limits::max() / 10 + and value <= upd_value // did not overflow + and c < 10; // char was digit + value = upd_value; + ++beg; + } + + if(is_negative) { + value = 0 - value; + } + + if(!valid) { + std::cerr << " token:" << std::string(m_start, m_end) + << std::endl; + } + + return valid; } - else - { - // parse with strtod - result.m_value.number_float = str_to_float_t(static_cast(nullptr), NULL); + }; + + /*! + @brief return number value for number tokens + + This function translates the last token into the most appropriate + number type (either integer, unsigned integer or floating point), + which is passed back to the caller via the result parameter. + + integral numbers that don't fit into the the range of the respective + type are parsed as number_float_t + + floating-point values do not satisfy std::isfinite predicate + are converted to value_t::null + + throws if the entire string [m_start .. m_cursor) cannot be + interpreted as a number - // replace infinity and NAN by null - if (not std::isfinite(result.m_value.number_float)) + @param[out] result @ref basic_json object to receive the number. + */ + void get_number(basic_json& result) const + { + assert(m_start != nullptr); + assert(m_start < m_cursor); + + strtonum num(reinterpret_cast(m_start), + reinterpret_cast(m_cursor)); + + const bool is_negative = *m_start == '-'; + + try { + if(not num.is_integral()) { + ; + } + else if(is_negative) { - type = value_t::null; - result.m_value = basic_json::json_value(); + result.m_type = value_t::number_integer; + result.m_value = static_cast(num); + return; + } + else + { + result.m_type = value_t::number_unsigned; + result.m_value = static_cast(num); + return; } + } catch (std::invalid_argument&) { + ; // overflow - will parse as double } - // save the type - result.m_type = type; + result.m_type = value_t::number_float; + result.m_value = static_cast(num); + + // replace infinity and NAN by null + if (not std::isfinite(result.m_value.number_float)) + { + result.m_type = value_t::null; + result.m_value = basic_json::json_value(); + } } private: diff --git a/src/json.hpp.re2c b/src/json.hpp.re2c index c6a99b89f3..72c413ba7f 100644 --- a/src/json.hpp.re2c +++ b/src/json.hpp.re2c @@ -8209,177 +8209,283 @@ class basic_json return result; } - /*! - @brief parse floating point number + + - This function (and its overloads) serves to select the most approprate - standard floating point number parsing function based on the type - supplied via the first parameter. Set this to @a - static_cast(nullptr). - @param[in] type the @ref number_float_t in use - @param[in,out] endptr recieves a pointer to the first character after - the number - @return the floating point number - */ - long double str_to_float_t(long double* /* type */, char** endptr) const - { - return std::strtold(reinterpret_cast(m_start), endptr); - } - /*! - @brief parse floating point number - This function (and its overloads) serves to select the most approprate - standard floating point number parsing function based on the type - supplied via the first parameter. Set this to @a - static_cast(nullptr). - @param[in] type the @ref number_float_t in use - @param[in,out] endptr recieves a pointer to the first character after - the number - @return the floating point number - */ - double str_to_float_t(double* /* type */, char** endptr) const - { - return std::strtod(reinterpret_cast(m_start), endptr); - } - /*! - @brief parse floating point number - This function (and its overloads) serves to select the most approprate - standard floating point number parsing function based on the type - supplied via the first parameter. Set this to @a - static_cast(nullptr). - @param[in] type the @ref number_float_t in use - @param[in,out] endptr recieves a pointer to the first character after - the number - @return the floating point number + + /*! + @brief parse string into a built-in arithmetic type as if + the current locale is POSIX. + + e.g. const auto x = static_cast("123.4"); + + throw if can't parse the entire string as a number, + or if the destination type is integral and the value + is outside of the type's range. */ - float str_to_float_t(float* /* type */, char** endptr) const + struct strtonum { - return std::strtof(reinterpret_cast(m_start), endptr); - } + public: + strtonum(const char* start, const char* end) + : m_start(start), m_end(end) + {} - /*! - @brief return number value for number tokens + template::value>::type > + operator T() const + { + T val{0}; - This function translates the last token into the most appropriate - number type (either integer, unsigned integer or floating point), - which is passed back to the caller via the result parameter. + if(parse(val, std::is_integral())) { + return val; + } - This function parses the integer component up to the radix point or - exponent while collecting information about the 'floating point - representation', which it stores in the result parameter. If there is - no radix point or exponent, and the number can fit into a @ref - number_integer_t or @ref number_unsigned_t then it sets the result - parameter accordingly. + throw std::invalid_argument( + std::string() + + "Can't parse '" + + std::string(m_start, m_end) + + "' as type " + + typeid(T).name()); + } - If the number is a floating point number the number is then parsed - using @a std:strtod (or @a std:strtof or @a std::strtold). + /// return true iff token matches ^[+-]\d+$ + bool is_integral() const + { + const char* p = m_start; - @param[out] result @ref basic_json object to receive the number, or - NAN if the conversion read past the current token. The latter case - needs to be treated by the caller function. - */ - void get_number(basic_json& result) const - { - assert(m_start != nullptr); + if(!p) { + return false; + } - const lexer::lexer_char_t* curptr = m_start; + if(*p == '-' or *p == '+') { + ++p; + } - // accumulate the integer conversion result (unsigned for now) - number_unsigned_t value = 0; + if(p == m_end) { + return false; + } + + while(p < m_end and *p >= '0' and *p <= '9') { + ++p; + } - // maximum absolute value of the relevant integer type - number_unsigned_t max; + return p == m_end; + } - // temporarily store the type to avoid unecessary bitfield access - value_t type; + private: + const char* const m_start = nullptr; + const char* const m_end = nullptr; - // look for sign - if (*curptr == '-') + static void strtof(float& f, + const char* str, + char** endptr) { - type = value_t::number_integer; - max = static_cast((std::numeric_limits::max)()) + 1; - curptr++; + f = std::strtof(str, endptr); } - else + + static void strtof(double& f, + const char* str, + char** endptr) { - type = value_t::number_unsigned; - max = static_cast((std::numeric_limits::max)()); + f = std::strtod(str, endptr); } - // count the significant figures - for (; curptr < m_cursor; curptr++) + static void strtof(long double& f, + const char* str, + char** endptr) { - // quickly skip tests if a digit - if (*curptr < '0' || *curptr > '9') - { - if (*curptr == '.') - { - // don't count '.' but change to float - type = value_t::number_float; - continue; + f = std::strtold(str, endptr); + } + + template + bool parse(T& value, /*is_integral=*/std::false_type) const + { + const char* data = m_start; + const size_t len = static_cast(m_end - m_start); + + const char decimal_point_char = + std::use_facet< std::numpunct >( + std::locale()).decimal_point(); + + // replace decimal separator with locale-specific + // version, if necessary; data will be repointed + // to either buf or tempstr containing the fixed + // string. + std::string tempstr; + std::array buf; + do { + if(decimal_point_char == '.') { + break; // don't need to convert } - // assume exponent (if not then will fail parse): change to - // float, stop counting and record exponent details - type = value_t::number_float; - break; - } - // skip if definitely not an integer - if (type != value_t::number_float) - { - // multiply last value by ten and add the new digit - auto temp = value * 10 + *curptr - '0'; + const size_t ds_pos = static_cast( + std::find(m_start, m_end, '.') - m_start ); - // test for overflow - if (temp < value || temp > max) - { - // overflow - type = value_t::number_float; + if(ds_pos == len) { + break; // no decimal separator } - else - { - // no overflow - save it - value = temp; + + // copy the data into the local buffer or + // tempstr, if buffer is too small; + // replace decimal separator, and update + // data to point to the modified bytes + if(len + 1 < buf.size()) { + std::copy(m_start, m_end, buf.data()); + buf[len] = 0; + buf[ds_pos] = decimal_point_char; + data = buf.data(); + } else { + tempstr.assign(m_start, m_end); + tempstr[ds_pos] = decimal_point_char; + data = tempstr.c_str(); } + } while(0); + + char* endptr = nullptr; + value = 0; + strtof(value, data, &endptr); + + + + // note that reading past the end is OK, the data may be, + // for example, "123.", where the parsed token only contains "123", + // but strtod will read the dot as well. + const bool ok = endptr >= data + len + and len > 0; + + if(ok and value == 0.0 and *data == '-') { + // some implementations forget to negate the zero + value = -0.0; } - } - // save the value (if not a float) - if (type == value_t::number_unsigned) - { - result.m_value.number_unsigned = value; + if(!ok) { + std::cerr << "data:" << data + << " token:" << std::string(m_start, len) + << " value:" << value + << " len:" << len + << " parsed_len:" << (endptr - data) << std::endl; + } + + return ok; } - else if (type == value_t::number_integer) + + template + bool parse(T& value, /*is_integral=*/std::true_type) const { - result.m_value.number_integer = -static_cast(value); + const char* beg = m_start; + const char* const end = m_end; + + if(beg == end) { + return false; + } + + const bool is_negative = (*beg == '-'); + + // json numbers can't start with '+' but + // this code is not json-specific + if(is_negative or *beg == '+') { + ++beg; // skip the leading sign + } + + bool valid = beg < end // must have some digits; + and ( T(-1) < 0 // type must be signed + or !is_negative); // if value is negative + value = 0; + + while(beg < end and valid) { + const uint8_t c = static_cast(*beg - '0'); + const T upd_value = value * 10 + c; + valid &= value <= std::numeric_limits::max() / 10 + and value <= upd_value // did not overflow + and c < 10; // char was digit + value = upd_value; + ++beg; + } + + if(is_negative) { + value = 0 - value; + } + + if(!valid) { + std::cerr << " token:" << std::string(m_start, m_end) + << std::endl; + } + + return valid; } - else - { - // parse with strtod - result.m_value.number_float = str_to_float_t(static_cast(nullptr), NULL); + }; + + /*! + @brief return number value for number tokens + + This function translates the last token into the most appropriate + number type (either integer, unsigned integer or floating point), + which is passed back to the caller via the result parameter. + + integral numbers that don't fit into the the range of the respective + type are parsed as number_float_t + + floating-point values do not satisfy std::isfinite predicate + are converted to value_t::null + + throws if the entire string [m_start .. m_cursor) cannot be + interpreted as a number - // replace infinity and NAN by null - if (not std::isfinite(result.m_value.number_float)) + @param[out] result @ref basic_json object to receive the number. + */ + void get_number(basic_json& result) const + { + assert(m_start != nullptr); + assert(m_start < m_cursor); + + strtonum num(reinterpret_cast(m_start), + reinterpret_cast(m_cursor)); + + const bool is_negative = *m_start == '-'; + + try { + if(not num.is_integral()) { + ; + } + else if(is_negative) { - type = value_t::null; - result.m_value = basic_json::json_value(); + result.m_type = value_t::number_integer; + result.m_value = static_cast(num); + return; + } + else + { + result.m_type = value_t::number_unsigned; + result.m_value = static_cast(num); + return; } + } catch (std::invalid_argument&) { + ; // overflow - will parse as double } - // save the type - result.m_type = type; + result.m_type = value_t::number_float; + result.m_value = static_cast(num); + + // replace infinity and NAN by null + if (not std::isfinite(result.m_value.number_float)) + { + result.m_type = value_t::null; + result.m_value = basic_json::json_value(); + } } private: diff --git a/test/src/unit-regression.cpp b/test/src/unit-regression.cpp index e04513caff..679b7f3d05 100644 --- a/test/src/unit-regression.cpp +++ b/test/src/unit-regression.cpp @@ -375,7 +375,7 @@ TEST_CASE("regression tests") }; // change locale to mess with decimal points - std::locale::global(std::locale(std::locale(), new CommaDecimalSeparator)); + auto orig_locale = std::locale::global(std::locale(std::locale(), new CommaDecimalSeparator)); CHECK(j1a.dump() == "23.42"); CHECK(j1b.dump() == "23.42"); @@ -399,6 +399,8 @@ TEST_CASE("regression tests") CHECK(j3c.dump() == "10000"); //CHECK(j3b.dump() == "1E04"); // roundtrip error //CHECK(j3c.dump() == "1e04"); // roundtrip error + + std::locale::global(orig_locale); } SECTION("issue #233 - Can't use basic_json::iterator as a base iterator for std::move_iterator") From 4eafaab8164667a2c309ac5b346981542b0873e5 Mon Sep 17 00:00:00 2001 From: Alex Astashyn Date: Sat, 3 Dec 2016 22:54:36 -0500 Subject: [PATCH 02/18] Improved overflow detection; removed debugging output statements. --- src/json.hpp | 47 ++++++++++++++++++++++++----------------------- src/json.hpp.re2c | 47 ++++++++++++++++++++++++----------------------- 2 files changed, 48 insertions(+), 46 deletions(-) diff --git a/src/json.hpp b/src/json.hpp index e28333bd47..70bfce5aa0 100644 --- a/src/json.hpp +++ b/src/json.hpp @@ -9223,14 +9223,6 @@ class basic_json value = -0.0; } - if(!ok) { - std::cerr << "data:" << data - << " token:" << std::string(m_start, len) - << " value:" << value - << " len:" << len - << " parsed_len:" << (endptr - data) << std::endl; - } - return ok; } @@ -9252,31 +9244,40 @@ class basic_json ++beg; // skip the leading sign } + // using unsigned accumulator x because the signed + // type can't hold absolute value of largest + // negative value of this type, so overflow detection + // becomes problematic. + using unsigned_T = typename std::make_unsigned::type; + unsigned_T x = 0; + bool valid = beg < end // must have some digits; and ( T(-1) < 0 // type must be signed or !is_negative); // if value is negative - value = 0; while(beg < end and valid) { const uint8_t c = static_cast(*beg - '0'); - const T upd_value = value * 10 + c; - valid &= value <= std::numeric_limits::max() / 10 - and value <= upd_value // did not overflow - and c < 10; // char was digit - value = upd_value; - ++beg; - } + const unsigned_T upd_x = x * 10 + c; - if(is_negative) { - value = 0 - value; - } + using lim_T = std::numeric_limits; + constexpr unsigned_T thr1 = lim_T::max() / 10; + valid &= c < 10 // char was digit + and x <= thr1 // multiplication did not overflow + and x <= upd_x; // addition did not overflow. + + // note that x <= upd_x alone can't detect overflow since + // we're not just adding a value of decltype(x) but + // also multiplying by 10 - if(!valid) { - std::cerr << " token:" << std::string(m_start, m_end) - << std::endl; + x = upd_x; + ++beg; } - return valid; + value = static_cast(is_negative ? 0 - x : x); + + // the final check to make sure the value did not roll over + // into positives is for edge cases, e.g. -2^63 + return valid && (is_negative == (value < 0)); } }; diff --git a/src/json.hpp.re2c b/src/json.hpp.re2c index 72c413ba7f..418e302c30 100644 --- a/src/json.hpp.re2c +++ b/src/json.hpp.re2c @@ -8372,14 +8372,6 @@ class basic_json value = -0.0; } - if(!ok) { - std::cerr << "data:" << data - << " token:" << std::string(m_start, len) - << " value:" << value - << " len:" << len - << " parsed_len:" << (endptr - data) << std::endl; - } - return ok; } @@ -8401,31 +8393,40 @@ class basic_json ++beg; // skip the leading sign } + // using unsigned accumulator x because the signed + // type can't hold absolute value of largest + // negative value of this type, so overflow detection + // becomes problematic. + using unsigned_T = typename std::make_unsigned::type; + unsigned_T x = 0; + bool valid = beg < end // must have some digits; and ( T(-1) < 0 // type must be signed or !is_negative); // if value is negative - value = 0; while(beg < end and valid) { const uint8_t c = static_cast(*beg - '0'); - const T upd_value = value * 10 + c; - valid &= value <= std::numeric_limits::max() / 10 - and value <= upd_value // did not overflow - and c < 10; // char was digit - value = upd_value; - ++beg; - } + const unsigned_T upd_x = x * 10 + c; - if(is_negative) { - value = 0 - value; - } + using lim_T = std::numeric_limits; + constexpr unsigned_T thr1 = lim_T::max() / 10; + valid &= c < 10 // char was digit + and x <= thr1 // multiplication did not overflow + and x <= upd_x; // addition did not overflow. + + // note that x <= upd_x alone can't detect overflow since + // we're not just adding a value of decltype(x) but + // also multiplying by 10 - if(!valid) { - std::cerr << " token:" << std::string(m_start, m_end) - << std::endl; + x = upd_x; + ++beg; } - return valid; + value = static_cast(is_negative ? 0 - x : x); + + // the final check to make sure the value did not roll over + // into positives is for edge cases, e.g. -2^63 + return valid && (is_negative == (value < 0)); } }; From c75efedc6ec7048244300a6305b25b2e117b440c Mon Sep 17 00:00:00 2001 From: Alex Astashyn Date: Sat, 3 Dec 2016 23:19:43 -0500 Subject: [PATCH 03/18] stylistic changes --- src/json.hpp | 92 ++++++++++++++++++++++++++--------------------- src/json.hpp.re2c | 92 ++++++++++++++++++++++++++--------------------- 2 files changed, 104 insertions(+), 80 deletions(-) diff --git a/src/json.hpp b/src/json.hpp index 70bfce5aa0..56949a4c76 100644 --- a/src/json.hpp +++ b/src/json.hpp @@ -9060,22 +9060,6 @@ class basic_json return result; } - - - - - - - - - - - - - - - - /*! @brief parse string into a built-in arithmetic type as if @@ -9086,6 +9070,9 @@ class basic_json throw if can't parse the entire string as a number, or if the destination type is integral and the value is outside of the type's range. + + note: in floating-point case strtod may parse + past the token's end - this is not an error. */ struct strtonum { @@ -9101,7 +9088,8 @@ class basic_json { T val{0}; - if(parse(val, std::is_integral())) { + if (parse(val, std::is_integral())) + { return val; } @@ -9113,24 +9101,35 @@ class basic_json + typeid(T).name()); } - /// return true iff token matches ^[+-]\d+$ + // return true iff token matches ^[+-]\d+$ + // + // this is a helper to determine whether to + // parse the token into floating-point or + // integral type. We wouldn't need it if + // we had separate token types for interal + // and floating-point cases. bool is_integral() const { const char* p = m_start; - if(!p) { + if (!p) + { return false; } - if(*p == '-' or *p == '+') { + if (*p == '-' or *p == '+') + { ++p; } - if(p == m_end) { + if (p == m_end) + { return false; } - while(p < m_end and *p >= '0' and *p <= '9') { + while (p < m_end and *p >= '0' + and *p <= '9') + { ++p; } @@ -9141,6 +9140,9 @@ class basic_json const char* const m_start = nullptr; const char* const m_end = nullptr; + // overloaded wrappers for strtod/strtof/strtold + // that will be called from parse + static void strtof(float& f, const char* str, char** endptr) @@ -9173,20 +9175,22 @@ class basic_json std::locale()).decimal_point(); // replace decimal separator with locale-specific - // version, if necessary; data will be repointed + // version, when necessary; data will be repointed // to either buf or tempstr containing the fixed // string. std::string tempstr; std::array buf; do { - if(decimal_point_char == '.') { + if (decimal_point_char == '.') + { break; // don't need to convert } const size_t ds_pos = static_cast( std::find(m_start, m_end, '.') - m_start ); - if(ds_pos == len) { + if (ds_pos == len) + { break; // no decimal separator } @@ -9194,31 +9198,33 @@ class basic_json // tempstr, if buffer is too small; // replace decimal separator, and update // data to point to the modified bytes - if(len + 1 < buf.size()) { + if (len + 1 < buf.size()) + { std::copy(m_start, m_end, buf.data()); buf[len] = 0; buf[ds_pos] = decimal_point_char; data = buf.data(); - } else { + } + else + { tempstr.assign(m_start, m_end); tempstr[ds_pos] = decimal_point_char; data = tempstr.c_str(); } - } while(0); + } while (0); char* endptr = nullptr; value = 0; strtof(value, data, &endptr); - - // note that reading past the end is OK, the data may be, - // for example, "123.", where the parsed token only contains "123", - // but strtod will read the dot as well. + // for example, "123.", where the parsed token only + // contains "123", but strtod will read the dot as well. const bool ok = endptr >= data + len and len > 0; - if(ok and value == 0.0 and *data == '-') { + if (ok and value == 0.0 and *data == '-') + { // some implementations forget to negate the zero value = -0.0; } @@ -9232,15 +9238,17 @@ class basic_json const char* beg = m_start; const char* const end = m_end; - if(beg == end) { + if (beg == end) + { return false; } const bool is_negative = (*beg == '-'); // json numbers can't start with '+' but - // this code is not json-specific - if(is_negative or *beg == '+') { + // this code is not strictly json-specific + if (is_negative or *beg == '+') + { ++beg; // skip the leading sign } @@ -9255,7 +9263,8 @@ class basic_json and ( T(-1) < 0 // type must be signed or !is_negative); // if value is negative - while(beg < end and valid) { + while (beg < end and valid) + { const uint8_t c = static_cast(*beg - '0'); const unsigned_T upd_x = x * 10 + c; @@ -9310,10 +9319,11 @@ class basic_json const bool is_negative = *m_start == '-'; try { - if(not num.is_integral()) { + if (not num.is_integral()) + { ; } - else if(is_negative) + else if (is_negative) { result.m_type = value_t::number_integer; result.m_value = static_cast(num); @@ -9325,7 +9335,9 @@ class basic_json result.m_value = static_cast(num); return; } - } catch (std::invalid_argument&) { + } + catch (std::invalid_argument&) + { ; // overflow - will parse as double } diff --git a/src/json.hpp.re2c b/src/json.hpp.re2c index 418e302c30..d97057a8c8 100644 --- a/src/json.hpp.re2c +++ b/src/json.hpp.re2c @@ -8209,22 +8209,6 @@ class basic_json return result; } - - - - - - - - - - - - - - - - /*! @brief parse string into a built-in arithmetic type as if @@ -8235,6 +8219,9 @@ class basic_json throw if can't parse the entire string as a number, or if the destination type is integral and the value is outside of the type's range. + + note: in floating-point case strtod may parse + past the token's end - this is not an error. */ struct strtonum { @@ -8250,7 +8237,8 @@ class basic_json { T val{0}; - if(parse(val, std::is_integral())) { + if (parse(val, std::is_integral())) + { return val; } @@ -8262,24 +8250,35 @@ class basic_json + typeid(T).name()); } - /// return true iff token matches ^[+-]\d+$ + // return true iff token matches ^[+-]\d+$ + // + // this is a helper to determine whether to + // parse the token into floating-point or + // integral type. We wouldn't need it if + // we had separate token types for interal + // and floating-point cases. bool is_integral() const { const char* p = m_start; - if(!p) { + if (!p) + { return false; } - if(*p == '-' or *p == '+') { + if (*p == '-' or *p == '+') + { ++p; } - if(p == m_end) { + if (p == m_end) + { return false; } - while(p < m_end and *p >= '0' and *p <= '9') { + while (p < m_end and *p >= '0' + and *p <= '9') + { ++p; } @@ -8290,6 +8289,9 @@ class basic_json const char* const m_start = nullptr; const char* const m_end = nullptr; + // overloaded wrappers for strtod/strtof/strtold + // that will be called from parse + static void strtof(float& f, const char* str, char** endptr) @@ -8322,20 +8324,22 @@ class basic_json std::locale()).decimal_point(); // replace decimal separator with locale-specific - // version, if necessary; data will be repointed + // version, when necessary; data will be repointed // to either buf or tempstr containing the fixed // string. std::string tempstr; std::array buf; do { - if(decimal_point_char == '.') { + if (decimal_point_char == '.') + { break; // don't need to convert } const size_t ds_pos = static_cast( std::find(m_start, m_end, '.') - m_start ); - if(ds_pos == len) { + if (ds_pos == len) + { break; // no decimal separator } @@ -8343,31 +8347,33 @@ class basic_json // tempstr, if buffer is too small; // replace decimal separator, and update // data to point to the modified bytes - if(len + 1 < buf.size()) { + if (len + 1 < buf.size()) + { std::copy(m_start, m_end, buf.data()); buf[len] = 0; buf[ds_pos] = decimal_point_char; data = buf.data(); - } else { + } + else + { tempstr.assign(m_start, m_end); tempstr[ds_pos] = decimal_point_char; data = tempstr.c_str(); } - } while(0); + } while (0); char* endptr = nullptr; value = 0; strtof(value, data, &endptr); - - // note that reading past the end is OK, the data may be, - // for example, "123.", where the parsed token only contains "123", - // but strtod will read the dot as well. + // for example, "123.", where the parsed token only + // contains "123", but strtod will read the dot as well. const bool ok = endptr >= data + len and len > 0; - if(ok and value == 0.0 and *data == '-') { + if (ok and value == 0.0 and *data == '-') + { // some implementations forget to negate the zero value = -0.0; } @@ -8381,15 +8387,17 @@ class basic_json const char* beg = m_start; const char* const end = m_end; - if(beg == end) { + if (beg == end) + { return false; } const bool is_negative = (*beg == '-'); // json numbers can't start with '+' but - // this code is not json-specific - if(is_negative or *beg == '+') { + // this code is not strictly json-specific + if (is_negative or *beg == '+') + { ++beg; // skip the leading sign } @@ -8404,7 +8412,8 @@ class basic_json and ( T(-1) < 0 // type must be signed or !is_negative); // if value is negative - while(beg < end and valid) { + while (beg < end and valid) + { const uint8_t c = static_cast(*beg - '0'); const unsigned_T upd_x = x * 10 + c; @@ -8459,10 +8468,11 @@ class basic_json const bool is_negative = *m_start == '-'; try { - if(not num.is_integral()) { + if (not num.is_integral()) + { ; } - else if(is_negative) + else if (is_negative) { result.m_type = value_t::number_integer; result.m_value = static_cast(num); @@ -8474,7 +8484,9 @@ class basic_json result.m_value = static_cast(num); return; } - } catch (std::invalid_argument&) { + } + catch (std::invalid_argument&) + { ; // overflow - will parse as double } From e41a956782fd64a2bdf253a13f05bc5e8239ec61 Mon Sep 17 00:00:00 2001 From: Alex Astashyn Date: Sun, 4 Dec 2016 13:23:39 -0500 Subject: [PATCH 04/18] Alternative handling of integer types relying on strto[u]ll --- src/json.hpp | 46 ++++++++++++++++++++++++++++++++++++++++++++-- src/json.hpp.re2c | 46 ++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 88 insertions(+), 4 deletions(-) diff --git a/src/json.hpp b/src/json.hpp index 56949a4c76..51b195185c 100644 --- a/src/json.hpp +++ b/src/json.hpp @@ -9073,6 +9073,8 @@ class basic_json note: in floating-point case strtod may parse past the token's end - this is not an error. + + any leading blanks are not handled. */ struct strtonum { @@ -9232,6 +9234,45 @@ class basic_json return ok; } + +#if 1 // parsing with strto[u]ll - easier to understand but slightly slower + + signed long long parse_integral( + char** endptr, + /*is_signed*/std::true_type) const + { + return std::strtoll(m_start, endptr, 10); + } + + unsigned long long parse_integral( + char** endptr, + /*is_signed*/std::false_type) const + { + return std::strtoull(m_start, endptr, 10); + } + + template + bool parse(T& value, /*is_integral=*/std::true_type) const + { + char* endptr = nullptr; + errno = 0; // these are thread-local + const auto x = parse_integral(&endptr, std::is_signed()); + + static_assert(std::is_signed() // called right overload? + == std::is_signed(), ""); + + value = static_cast(x); + + return x == static_cast(value) // x fits into destination T + and (x != 0 or is_integral()) // strto[u]ll did nto fail + and errno == 0 // strto[u]ll did not overflow + and m_start < m_end // token was not empty + and endptr == m_end // parsed entire token exactly + and (x < 0) == (*m_start == '-'); // input was sign-compatible + } + +#else // parsing integral types manually + template bool parse(T& value, /*is_integral=*/std::true_type) const { @@ -9240,7 +9281,7 @@ class basic_json if (beg == end) { - return false; + return false; // empty token } const bool is_negative = (*beg == '-'); @@ -9261,7 +9302,7 @@ class basic_json bool valid = beg < end // must have some digits; and ( T(-1) < 0 // type must be signed - or !is_negative); // if value is negative + or !is_negative); // ...if value is negative while (beg < end and valid) { @@ -9288,6 +9329,7 @@ class basic_json // into positives is for edge cases, e.g. -2^63 return valid && (is_negative == (value < 0)); } +#endif }; /*! diff --git a/src/json.hpp.re2c b/src/json.hpp.re2c index d97057a8c8..e5f344b223 100644 --- a/src/json.hpp.re2c +++ b/src/json.hpp.re2c @@ -8222,6 +8222,8 @@ class basic_json note: in floating-point case strtod may parse past the token's end - this is not an error. + + any leading blanks are not handled. */ struct strtonum { @@ -8381,6 +8383,45 @@ class basic_json return ok; } + +#if 1 // parsing with strto[u]ll - easier to understand but slightly slower + + signed long long parse_integral( + char** endptr, + /*is_signed*/std::true_type) const + { + return std::strtoll(m_start, endptr, 10); + } + + unsigned long long parse_integral( + char** endptr, + /*is_signed*/std::false_type) const + { + return std::strtoull(m_start, endptr, 10); + } + + template + bool parse(T& value, /*is_integral=*/std::true_type) const + { + char* endptr = nullptr; + errno = 0; // these are thread-local + const auto x = parse_integral(&endptr, std::is_signed()); + + static_assert(std::is_signed() // called right overload? + == std::is_signed(), ""); + + value = static_cast(x); + + return x == static_cast(value) // x fits into destination T + and (x != 0 or is_integral()) // strto[u]ll did nto fail + and errno == 0 // strto[u]ll did not overflow + and m_start < m_end // token was not empty + and endptr == m_end // parsed entire token exactly + and (x < 0) == (*m_start == '-'); // input was sign-compatible + } + +#else // parsing integral types manually + template bool parse(T& value, /*is_integral=*/std::true_type) const { @@ -8389,7 +8430,7 @@ class basic_json if (beg == end) { - return false; + return false; // empty token } const bool is_negative = (*beg == '-'); @@ -8410,7 +8451,7 @@ class basic_json bool valid = beg < end // must have some digits; and ( T(-1) < 0 // type must be signed - or !is_negative); // if value is negative + or !is_negative); // ...if value is negative while (beg < end and valid) { @@ -8437,6 +8478,7 @@ class basic_json // into positives is for edge cases, e.g. -2^63 return valid && (is_negative == (value < 0)); } +#endif }; /*! From d64336057569281386aafed096a6db77d8aa901a Mon Sep 17 00:00:00 2001 From: Alex Astashyn Date: Tue, 6 Dec 2016 00:43:12 -0500 Subject: [PATCH 05/18] Bugfix: when working with C formatting functions we need to query C locales (localeconv) rather than std::locale --- src/json.hpp | 12 +++++++++++- src/json.hpp.re2c | 12 +++++++++++- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/src/json.hpp b/src/json.hpp index 51b195185c..f1666c6c6c 100644 --- a/src/json.hpp +++ b/src/json.hpp @@ -9108,7 +9108,7 @@ class basic_json // this is a helper to determine whether to // parse the token into floating-point or // integral type. We wouldn't need it if - // we had separate token types for interal + // we had separate token types for integral // and floating-point cases. bool is_integral() const { @@ -9172,9 +9172,19 @@ class basic_json const char* data = m_start; const size_t len = static_cast(m_end - m_start); +#if 0 const char decimal_point_char = std::use_facet< std::numpunct >( std::locale()).decimal_point(); +#else + // Since dealing with strtod family of functions, + // need to use the C locales instead of C++ + const auto loc = localeconv(); + assert(loc != nullptr); + const char decimal_point_char = + !loc->decimal_point ? '.' + : loc->decimal_point[0]; +#endif // replace decimal separator with locale-specific // version, when necessary; data will be repointed diff --git a/src/json.hpp.re2c b/src/json.hpp.re2c index e5f344b223..b805903546 100644 --- a/src/json.hpp.re2c +++ b/src/json.hpp.re2c @@ -8257,7 +8257,7 @@ class basic_json // this is a helper to determine whether to // parse the token into floating-point or // integral type. We wouldn't need it if - // we had separate token types for interal + // we had separate token types for integral // and floating-point cases. bool is_integral() const { @@ -8321,9 +8321,19 @@ class basic_json const char* data = m_start; const size_t len = static_cast(m_end - m_start); +#if 0 const char decimal_point_char = std::use_facet< std::numpunct >( std::locale()).decimal_point(); +#else + // Since dealing with strtod family of functions, + // need to use the C locales instead of C++ + const auto loc = localeconv(); + assert(loc != nullptr); + const char decimal_point_char = + !loc->decimal_point ? '.' + : loc->decimal_point[0]; +#endif // replace decimal separator with locale-specific // version, when necessary; data will be repointed From 0c87d5d6b34d8595ceb645be1e0b82c4a4eaafe2 Mon Sep 17 00:00:00 2001 From: Alex Astashyn Date: Tue, 6 Dec 2016 19:41:05 -0500 Subject: [PATCH 06/18] Refactored preprocessing with a lambda instead of do{...}while(0) --- src/json.hpp | 48 ++++++++++++++++++++++-------------------------- 1 file changed, 22 insertions(+), 26 deletions(-) diff --git a/src/json.hpp b/src/json.hpp index f1666c6c6c..19f1f109c2 100644 --- a/src/json.hpp +++ b/src/json.hpp @@ -9169,33 +9169,28 @@ class basic_json template bool parse(T& value, /*is_integral=*/std::false_type) const { - const char* data = m_start; - const size_t len = static_cast(m_end - m_start); - -#if 0 - const char decimal_point_char = - std::use_facet< std::numpunct >( - std::locale()).decimal_point(); -#else - // Since dealing with strtod family of functions, - // need to use the C locales instead of C++ - const auto loc = localeconv(); - assert(loc != nullptr); - const char decimal_point_char = - !loc->decimal_point ? '.' - : loc->decimal_point[0]; -#endif - // replace decimal separator with locale-specific - // version, when necessary; data will be repointed - // to either buf or tempstr containing the fixed - // string. + // version, when necessary; data will point to + // either the original string, or buf, or tempstr + // containing the fixed string. std::string tempstr; std::array buf; - do { + const size_t len = static_cast(m_end - m_start); + + const char* const data = [this, &tempstr, &buf, len]() -> const char* + { + // Since dealing with strtod family of functions, + // we're getting the decimal point char from the + // C locale facilities instead of C++'s numpunct + // facet of the current std::locale; + const auto loc = localeconv(); + assert(loc != nullptr); + const char decimal_point_char = !loc->decimal_point ? '.' + : loc->decimal_point[0]; + if (decimal_point_char == '.') { - break; // don't need to convert + return m_start; // don't need to convert } const size_t ds_pos = static_cast( @@ -9203,7 +9198,7 @@ class basic_json if (ds_pos == len) { - break; // no decimal separator + return m_start; // no decimal separator } // copy the data into the local buffer or @@ -9215,18 +9210,19 @@ class basic_json std::copy(m_start, m_end, buf.data()); buf[len] = 0; buf[ds_pos] = decimal_point_char; - data = buf.data(); + return buf.data(); } else { tempstr.assign(m_start, m_end); tempstr[ds_pos] = decimal_point_char; - data = tempstr.c_str(); + return tempstr.c_str(); } - } while (0); + }(); char* endptr = nullptr; value = 0; + // this calls appropriate overload depending on T strtof(value, data, &endptr); // note that reading past the end is OK, the data may be, From 7a081244a575284246d38a594fb0436ca2b6dacc Mon Sep 17 00:00:00 2001 From: Alex Astashyn Date: Tue, 6 Dec 2016 19:41:31 -0500 Subject: [PATCH 07/18] Refactored preprocessing with a lambda instead of do{...}while(0) --- src/json.hpp.re2c | 48 ++++++++++++++++++++++------------------------- 1 file changed, 22 insertions(+), 26 deletions(-) diff --git a/src/json.hpp.re2c b/src/json.hpp.re2c index b805903546..27a53fb785 100644 --- a/src/json.hpp.re2c +++ b/src/json.hpp.re2c @@ -8318,33 +8318,28 @@ class basic_json template bool parse(T& value, /*is_integral=*/std::false_type) const { - const char* data = m_start; - const size_t len = static_cast(m_end - m_start); - -#if 0 - const char decimal_point_char = - std::use_facet< std::numpunct >( - std::locale()).decimal_point(); -#else - // Since dealing with strtod family of functions, - // need to use the C locales instead of C++ - const auto loc = localeconv(); - assert(loc != nullptr); - const char decimal_point_char = - !loc->decimal_point ? '.' - : loc->decimal_point[0]; -#endif - // replace decimal separator with locale-specific - // version, when necessary; data will be repointed - // to either buf or tempstr containing the fixed - // string. + // version, when necessary; data will point to + // either the original string, or buf, or tempstr + // containing the fixed string. std::string tempstr; std::array buf; - do { + const size_t len = static_cast(m_end - m_start); + + const char* const data = [this, &tempstr, &buf, len]() -> const char* + { + // Since dealing with strtod family of functions, + // we're getting the decimal point char from the + // C locale facilities instead of C++'s numpunct + // facet of the current std::locale; + const auto loc = localeconv(); + assert(loc != nullptr); + const char decimal_point_char = !loc->decimal_point ? '.' + : loc->decimal_point[0]; + if (decimal_point_char == '.') { - break; // don't need to convert + return m_start; // don't need to convert } const size_t ds_pos = static_cast( @@ -8352,7 +8347,7 @@ class basic_json if (ds_pos == len) { - break; // no decimal separator + return m_start; // no decimal separator } // copy the data into the local buffer or @@ -8364,18 +8359,19 @@ class basic_json std::copy(m_start, m_end, buf.data()); buf[len] = 0; buf[ds_pos] = decimal_point_char; - data = buf.data(); + return buf.data(); } else { tempstr.assign(m_start, m_end); tempstr[ds_pos] = decimal_point_char; - data = tempstr.c_str(); + return tempstr.c_str(); } - } while (0); + }(); char* endptr = nullptr; value = 0; + // this calls appropriate overload depending on T strtof(value, data, &endptr); // note that reading past the end is OK, the data may be, From 6e8da7d8c4aada79740076755ed487da505ceb28 Mon Sep 17 00:00:00 2001 From: Alex Astashyn Date: Tue, 6 Dec 2016 19:45:48 -0500 Subject: [PATCH 08/18] Added unit-test for issue #379 (locale-independent str-to-num) --- test/src/unit-regression.cpp | 38 ++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/test/src/unit-regression.cpp b/test/src/unit-regression.cpp index 679b7f3d05..2787fe2616 100644 --- a/test/src/unit-regression.cpp +++ b/test/src/unit-regression.cpp @@ -403,6 +403,44 @@ TEST_CASE("regression tests") std::locale::global(orig_locale); } + SECTION("issue #379 - locale-independent str-to-num") + { + const std::string orig_locale_name(setlocale(LC_ALL, NULL)); + + setlocale(LC_NUMERIC, "de_DE"); + std::array buf; + + { + // verify that snprintf now uses commas as decimal-separator + std::snprintf(buf.data(), buf.size(), "%.2f", 3.14); + assert(std::strcmp(buf.data(), "3,14") == 0); + + // verify that strtod now uses commas as decimal-separator + const double d1 = std::strtod(buf.data(), nullptr); + assert(d1 == 3.14); + + // verify that strtod does not understand dots as decimal separator + const double d2 = std::strtod("3.14", nullptr); + assert(d2 == 3); + } + + const json j1 = json::parse("3.14"); + + // verify that parsed correctly despite using strtod internally + CHECK(j1.get() == 3.14); + + // verify that dumped correctly despite using snprintf internally + CHECK(j1.dump() == "3.14"); + + // check a different code path + const json j2 = json::parse("1.000000000000000000000000000000000000000000000000000000000000000000000000"); + CHECK(j2.get() == 1.0); + + // restore original locale + setlocale(LC_ALL, orig_locale_name.c_str()); + } + + SECTION("issue #233 - Can't use basic_json::iterator as a base iterator for std::move_iterator") { json source = {"a", "b", "c"}; From d2e9ce270a642e4ca7e00fc160c81eadb55a5639 Mon Sep 17 00:00:00 2001 From: Alex Astashyn Date: Tue, 6 Dec 2016 22:18:20 -0500 Subject: [PATCH 09/18] Trying to coerce setlocale to make snprintf use commas as delimiter; the behavior appears to be compiler/platform-specific --- test/src/unit-regression.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/src/unit-regression.cpp b/test/src/unit-regression.cpp index 2787fe2616..17616a3e6d 100644 --- a/test/src/unit-regression.cpp +++ b/test/src/unit-regression.cpp @@ -407,21 +407,21 @@ TEST_CASE("regression tests") { const std::string orig_locale_name(setlocale(LC_ALL, NULL)); - setlocale(LC_NUMERIC, "de_DE"); + setlocale(LC_NUMERIC, "fr_Fr.UTF-8"); std::array buf; { // verify that snprintf now uses commas as decimal-separator std::snprintf(buf.data(), buf.size(), "%.2f", 3.14); - assert(std::strcmp(buf.data(), "3,14") == 0); + CHECK(std::strcmp(buf.data(), "3,14") == 0); // verify that strtod now uses commas as decimal-separator const double d1 = std::strtod(buf.data(), nullptr); - assert(d1 == 3.14); + CHECK(d1 == 3.14); // verify that strtod does not understand dots as decimal separator const double d2 = std::strtod("3.14", nullptr); - assert(d2 == 3); + CHECK(d2 == 3); } const json j1 = json::parse("3.14"); From d169598c6c236ce115a8453d917d99efc79fa3a3 Mon Sep 17 00:00:00 2001 From: Alex Astashyn Date: Tue, 6 Dec 2016 22:20:48 -0500 Subject: [PATCH 10/18] simplified code a bit based on @gregmarr's suggestions --- src/json.hpp | 69 +++++++++++++++++++++------------------------ src/json.hpp.re2c | 71 ++++++++++++++++++++++------------------------- 2 files changed, 65 insertions(+), 75 deletions(-) diff --git a/src/json.hpp b/src/json.hpp index 19f1f109c2..1644d91f22 100644 --- a/src/json.hpp +++ b/src/json.hpp @@ -9177,48 +9177,43 @@ class basic_json std::array buf; const size_t len = static_cast(m_end - m_start); - const char* const data = [this, &tempstr, &buf, len]() -> const char* + // Since dealing with strtod family of functions, + // we're getting the decimal point char from the + // C locale facilities instead of C++'s numpunct + // facet of the current std::locale; + const auto loc = localeconv(); + assert(loc != nullptr); + const char decimal_point_char = !loc->decimal_point ? '.' + : loc->decimal_point[0]; + + const char *data = m_start; + + if (decimal_point_char != '.') { - // Since dealing with strtod family of functions, - // we're getting the decimal point char from the - // C locale facilities instead of C++'s numpunct - // facet of the current std::locale; - const auto loc = localeconv(); - assert(loc != nullptr); - const char decimal_point_char = !loc->decimal_point ? '.' - : loc->decimal_point[0]; - - if (decimal_point_char == '.') - { - return m_start; // don't need to convert - } - const size_t ds_pos = static_cast( std::find(m_start, m_end, '.') - m_start ); - - if (ds_pos == len) - { - return m_start; // no decimal separator - } - - // copy the data into the local buffer or - // tempstr, if buffer is too small; - // replace decimal separator, and update - // data to point to the modified bytes - if (len + 1 < buf.size()) - { - std::copy(m_start, m_end, buf.data()); - buf[len] = 0; - buf[ds_pos] = decimal_point_char; - return buf.data(); - } - else + + if (ds_pos != len) { - tempstr.assign(m_start, m_end); - tempstr[ds_pos] = decimal_point_char; - return tempstr.c_str(); + // copy the data into the local buffer or + // tempstr, if buffer is too small; + // replace decimal separator, and update + // data to point to the modified bytes + if (len + 1 < buf.size()) + { + std::copy(m_start, m_end, buf.data()); + buf[len] = 0; + buf[ds_pos] = decimal_point_char; + data = buf.data(); + } + else + { + tempstr.assign(m_start, m_end); + tempstr[ds_pos] = decimal_point_char; + data = tempstr.c_str(); + } } - }(); + } char* endptr = nullptr; value = 0; diff --git a/src/json.hpp.re2c b/src/json.hpp.re2c index 27a53fb785..b541db1e95 100644 --- a/src/json.hpp.re2c +++ b/src/json.hpp.re2c @@ -8326,48 +8326,43 @@ class basic_json std::array buf; const size_t len = static_cast(m_end - m_start); - const char* const data = [this, &tempstr, &buf, len]() -> const char* - { - // Since dealing with strtod family of functions, - // we're getting the decimal point char from the - // C locale facilities instead of C++'s numpunct - // facet of the current std::locale; - const auto loc = localeconv(); - assert(loc != nullptr); - const char decimal_point_char = !loc->decimal_point ? '.' - : loc->decimal_point[0]; - - if (decimal_point_char == '.') - { - return m_start; // don't need to convert - } - + // Since dealing with strtod family of functions, + // we're getting the decimal point char from the + // C locale facilities instead of C++'s numpunct + // facet of the current std::locale; + const auto loc = localeconv(); + assert(loc != nullptr); + const char decimal_point_char = !loc->decimal_point ? '.' + : loc->decimal_point[0]; + + const char *data = m_start; + + if (decimal_point_char != '.') + { const size_t ds_pos = static_cast( std::find(m_start, m_end, '.') - m_start ); - - if (ds_pos == len) - { - return m_start; // no decimal separator - } - - // copy the data into the local buffer or - // tempstr, if buffer is too small; - // replace decimal separator, and update - // data to point to the modified bytes - if (len + 1 < buf.size()) - { - std::copy(m_start, m_end, buf.data()); - buf[len] = 0; - buf[ds_pos] = decimal_point_char; - return buf.data(); - } - else + + if (ds_pos != len) { - tempstr.assign(m_start, m_end); - tempstr[ds_pos] = decimal_point_char; - return tempstr.c_str(); + // copy the data into the local buffer or + // tempstr, if buffer is too small; + // replace decimal separator, and update + // data to point to the modified bytes + if (len + 1 < buf.size()) + { + std::copy(m_start, m_end, buf.data()); + buf[len] = 0; + buf[ds_pos] = decimal_point_char; + data = buf.data(); + } + else + { + tempstr.assign(m_start, m_end); + tempstr[ds_pos] = decimal_point_char; + data = tempstr.c_str(); + } } - }(); + } char* endptr = nullptr; value = 0; From 6774457733524e81d7aa6fd0f3df6fd48423d0f7 Mon Sep 17 00:00:00 2001 From: Alex Astashyn Date: Tue, 6 Dec 2016 22:59:12 -0500 Subject: [PATCH 11/18] Trying to coerce setlocale to make snprintf use commas as delimiter some more --- test/src/unit-regression.cpp | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/test/src/unit-regression.cpp b/test/src/unit-regression.cpp index 17616a3e6d..41ddfa40cb 100644 --- a/test/src/unit-regression.cpp +++ b/test/src/unit-regression.cpp @@ -405,9 +405,14 @@ TEST_CASE("regression tests") SECTION("issue #379 - locale-independent str-to-num") { - const std::string orig_locale_name(setlocale(LC_ALL, NULL)); + // If I save the locale here and restore it in the end + // of this block, then setLocale(LC_NUMERIC, "de_DE") + // does not actually make snprintf use "," as decimal separator + // on some compilers. I have no idea... + // + //const std::string orig_locale_name(setlocale(LC_ALL, NULL)); - setlocale(LC_NUMERIC, "fr_Fr.UTF-8"); + setlocale(LC_NUMERIC, "de_DE"); std::array buf; { @@ -437,7 +442,7 @@ TEST_CASE("regression tests") CHECK(j2.get() == 1.0); // restore original locale - setlocale(LC_ALL, orig_locale_name.c_str()); + // setlocale(LC_ALL, orig_locale_name.c_str()); } From 0a4a6a8399d1f4194d009c50ec076dd287cafa9e Mon Sep 17 00:00:00 2001 From: Alex Astashyn Date: Wed, 7 Dec 2016 19:53:27 -0500 Subject: [PATCH 12/18] Refactored to avoid using exceptions, as there are plans to support exceptionless mode --- src/json.hpp | 76 ++++++++++++++++++++++++++++++++++++----------- src/json.hpp.re2c | 76 ++++++++++++++++++++++++++++++++++++----------- 2 files changed, 116 insertions(+), 36 deletions(-) diff --git a/src/json.hpp b/src/json.hpp index 1644d91f22..79d6bb0244 100644 --- a/src/json.hpp +++ b/src/json.hpp @@ -9079,10 +9079,42 @@ class basic_json struct strtonum { public: - strtonum(const char* start, const char* end) + template + struct maybe + { + T x; + bool valid; + + maybe(const T& xx) + : x(xx), valid(true) + {} + + maybe() : x(), valid(false) + {} + + operator bool() const + { + return valid; + } + + const T& operator*() const + { + return x; + } + }; + + strtonum(const char* start, const char* end) : m_start(start), m_end(end) {} + template::value>::type > + bool to(T& val) const + { + return parse(val, std::is_integral()); + } + template::value>::type > @@ -9361,31 +9393,39 @@ class basic_json const bool is_negative = *m_start == '-'; - try { - if (not num.is_integral()) - { - ; - } - else if (is_negative) - { + result.m_type = value_t::discarded; + + if (not num.is_integral()) + { + ; // will parse as float below + } + else if (is_negative) + { + number_integer_t val{0}; + if(num.to(val)) { result.m_type = value_t::number_integer; - result.m_value = static_cast(num); - return; - } - else - { + result.m_value = val; + } + } + else + { + number_unsigned_t val{0}; + if(num.to(val)) { result.m_type = value_t::number_unsigned; - result.m_value = static_cast(num); - return; + result.m_value = val; } } - catch (std::invalid_argument&) + + number_float_t val{0}; + if (result.m_type != value_t::discarded + or !num.to(val)) { - ; // overflow - will parse as double + return; // already have a value from above + // or couldn't parse as float_t } result.m_type = value_t::number_float; - result.m_value = static_cast(num); + result.m_value = val; // replace infinity and NAN by null if (not std::isfinite(result.m_value.number_float)) diff --git a/src/json.hpp.re2c b/src/json.hpp.re2c index b541db1e95..24c5497555 100644 --- a/src/json.hpp.re2c +++ b/src/json.hpp.re2c @@ -8228,10 +8228,42 @@ class basic_json struct strtonum { public: - strtonum(const char* start, const char* end) + template + struct maybe + { + T x; + bool valid; + + maybe(const T& xx) + : x(xx), valid(true) + {} + + maybe() : x(), valid(false) + {} + + operator bool() const + { + return valid; + } + + const T& operator*() const + { + return x; + } + }; + + strtonum(const char* start, const char* end) : m_start(start), m_end(end) {} + template::value>::type > + bool to(T& val) const + { + return parse(val, std::is_integral()); + } + template::value>::type > @@ -8510,31 +8542,39 @@ class basic_json const bool is_negative = *m_start == '-'; - try { - if (not num.is_integral()) - { - ; - } - else if (is_negative) - { + result.m_type = value_t::discarded; + + if (not num.is_integral()) + { + ; // will parse as float below + } + else if (is_negative) + { + number_integer_t val{0}; + if(num.to(val)) { result.m_type = value_t::number_integer; - result.m_value = static_cast(num); - return; - } - else - { + result.m_value = val; + } + } + else + { + number_unsigned_t val{0}; + if(num.to(val)) { result.m_type = value_t::number_unsigned; - result.m_value = static_cast(num); - return; + result.m_value = val; } } - catch (std::invalid_argument&) + + number_float_t val{0}; + if (result.m_type != value_t::discarded + or !num.to(val)) { - ; // overflow - will parse as double + return; // already have a value from above + // or couldn't parse as float_t } result.m_type = value_t::number_float; - result.m_value = static_cast(num); + result.m_value = val; // replace infinity and NAN by null if (not std::isfinite(result.m_value.number_float)) From 27d9740ad6d9b564beb0e995e0ab393c6ab3daf7 Mon Sep 17 00:00:00 2001 From: Alex Astashyn Date: Wed, 7 Dec 2016 19:55:07 -0500 Subject: [PATCH 13/18] Tweaks to unit-test for issue #379 --- test/src/unit-regression.cpp | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/test/src/unit-regression.cpp b/test/src/unit-regression.cpp index 41ddfa40cb..deace862ea 100644 --- a/test/src/unit-regression.cpp +++ b/test/src/unit-regression.cpp @@ -406,22 +406,27 @@ TEST_CASE("regression tests") SECTION("issue #379 - locale-independent str-to-num") { // If I save the locale here and restore it in the end - // of this block, then setLocale(LC_NUMERIC, "de_DE") + // of this block, then setLocale(LC_NUMERIC, "de_DE") // does not actually make snprintf use "," as decimal separator // on some compilers. I have no idea... - // //const std::string orig_locale_name(setlocale(LC_ALL, NULL)); - - setlocale(LC_NUMERIC, "de_DE"); + + + // Still, just setting some comma-using locale does not + // make snprintf output commas on all platforms, so instead + // will change dot to comma in the current locale below + //setlocale(LC_NUMERIC, "de_DE"); + + auto loc = localeconv(); + auto orig_decimal_point = loc->decimal_point; + char comma[] = ","; + loc->decimal_point = comma; + std::array buf; { - // verify that snprintf now uses commas as decimal-separator - std::snprintf(buf.data(), buf.size(), "%.2f", 3.14); - CHECK(std::strcmp(buf.data(), "3,14") == 0); - // verify that strtod now uses commas as decimal-separator - const double d1 = std::strtod(buf.data(), nullptr); + const double d1 = std::strtod("3,14", nullptr); CHECK(d1 == 3.14); // verify that strtod does not understand dots as decimal separator @@ -429,18 +434,15 @@ TEST_CASE("regression tests") CHECK(d2 == 3); } - const json j1 = json::parse("3.14"); - // verify that parsed correctly despite using strtod internally + const json j1 = json::parse("3.14"); CHECK(j1.get() == 3.14); - // verify that dumped correctly despite using snprintf internally - CHECK(j1.dump() == "3.14"); - // check a different code path const json j2 = json::parse("1.000000000000000000000000000000000000000000000000000000000000000000000000"); CHECK(j2.get() == 1.0); + loc->decimal_point = orig_decimal_point; // restore original locale // setlocale(LC_ALL, orig_locale_name.c_str()); } From 38499e84fc771b21db7764ff4e014f7c3939d4b0 Mon Sep 17 00:00:00 2001 From: Alex Astashyn Date: Thu, 8 Dec 2016 21:38:14 -0500 Subject: [PATCH 14/18] Removed unused struct; fixed comments --- src/json.hpp | 69 +++++++++-------------------------------------- src/json.hpp.re2c | 69 +++++++++-------------------------------------- 2 files changed, 24 insertions(+), 114 deletions(-) diff --git a/src/json.hpp b/src/json.hpp index 79d6bb0244..867afbb350 100644 --- a/src/json.hpp +++ b/src/json.hpp @@ -9065,12 +9065,6 @@ class basic_json @brief parse string into a built-in arithmetic type as if the current locale is POSIX. - e.g. const auto x = static_cast("123.4"); - - throw if can't parse the entire string as a number, - or if the destination type is integral and the value - is outside of the type's range. - note: in floating-point case strtod may parse past the token's end - this is not an error. @@ -9079,34 +9073,15 @@ class basic_json struct strtonum { public: - template - struct maybe - { - T x; - bool valid; - - maybe(const T& xx) - : x(xx), valid(true) - {} - - maybe() : x(), valid(false) - {} - - operator bool() const - { - return valid; - } - - const T& operator*() const - { - return x; - } - }; - strtonum(const char* start, const char* end) : m_start(start), m_end(end) {} + /// return true iff parsed successfully as + /// number of type T. + /// + /// @val shall contain parsed value, or + /// undefined value if could not parse. template::value>::type > @@ -9115,33 +9090,13 @@ class basic_json return parse(val, std::is_integral()); } - template::value>::type > - operator T() const - { - T val{0}; - - if (parse(val, std::is_integral())) - { - return val; - } - - throw std::invalid_argument( - std::string() - + "Can't parse '" - + std::string(m_start, m_end) - + "' as type " - + typeid(T).name()); - } - - // return true iff token matches ^[+-]\d+$ - // - // this is a helper to determine whether to - // parse the token into floating-point or - // integral type. We wouldn't need it if - // we had separate token types for integral - // and floating-point cases. + /// return true iff token matches ^[+-]\d+$ + /// + /// this is a helper to determine whether to + /// parse the token into floating-point or + /// integral type. We wouldn't need it if + /// we had separate token types for integral + /// and floating-point cases. bool is_integral() const { const char* p = m_start; diff --git a/src/json.hpp.re2c b/src/json.hpp.re2c index 24c5497555..5b2dc8d2d3 100644 --- a/src/json.hpp.re2c +++ b/src/json.hpp.re2c @@ -8214,12 +8214,6 @@ class basic_json @brief parse string into a built-in arithmetic type as if the current locale is POSIX. - e.g. const auto x = static_cast("123.4"); - - throw if can't parse the entire string as a number, - or if the destination type is integral and the value - is outside of the type's range. - note: in floating-point case strtod may parse past the token's end - this is not an error. @@ -8228,34 +8222,15 @@ class basic_json struct strtonum { public: - template - struct maybe - { - T x; - bool valid; - - maybe(const T& xx) - : x(xx), valid(true) - {} - - maybe() : x(), valid(false) - {} - - operator bool() const - { - return valid; - } - - const T& operator*() const - { - return x; - } - }; - strtonum(const char* start, const char* end) : m_start(start), m_end(end) {} + /// return true iff parsed successfully as + /// number of type T. + /// + /// @val shall contain parsed value, or + /// undefined value if could not parse. template::value>::type > @@ -8264,33 +8239,13 @@ class basic_json return parse(val, std::is_integral()); } - template::value>::type > - operator T() const - { - T val{0}; - - if (parse(val, std::is_integral())) - { - return val; - } - - throw std::invalid_argument( - std::string() - + "Can't parse '" - + std::string(m_start, m_end) - + "' as type " - + typeid(T).name()); - } - - // return true iff token matches ^[+-]\d+$ - // - // this is a helper to determine whether to - // parse the token into floating-point or - // integral type. We wouldn't need it if - // we had separate token types for integral - // and floating-point cases. + /// return true iff token matches ^[+-]\d+$ + /// + /// this is a helper to determine whether to + /// parse the token into floating-point or + /// integral type. We wouldn't need it if + /// we had separate token types for integral + /// and floating-point cases. bool is_integral() const { const char* p = m_start; From 1c029b97c0ebc90b5b7f1cb875b0904f115668b8 Mon Sep 17 00:00:00 2001 From: Alex Astashyn Date: Thu, 8 Dec 2016 22:13:05 -0500 Subject: [PATCH 15/18] Still trying to invoke locale-specific behavior in CI --- test/src/unit-regression.cpp | 35 +++++------------------------------ 1 file changed, 5 insertions(+), 30 deletions(-) diff --git a/test/src/unit-regression.cpp b/test/src/unit-regression.cpp index deace862ea..5cd6356d35 100644 --- a/test/src/unit-regression.cpp +++ b/test/src/unit-regression.cpp @@ -405,46 +405,21 @@ TEST_CASE("regression tests") SECTION("issue #379 - locale-independent str-to-num") { - // If I save the locale here and restore it in the end - // of this block, then setLocale(LC_NUMERIC, "de_DE") - // does not actually make snprintf use "," as decimal separator - // on some compilers. I have no idea... - //const std::string orig_locale_name(setlocale(LC_ALL, NULL)); - - - // Still, just setting some comma-using locale does not - // make snprintf output commas on all platforms, so instead - // will change dot to comma in the current locale below - //setlocale(LC_NUMERIC, "de_DE"); - - auto loc = localeconv(); - auto orig_decimal_point = loc->decimal_point; - char comma[] = ","; - loc->decimal_point = comma; - - std::array buf; + setlocale(LC_NUMERIC, "de_DE.UTF-8"); { // verify that strtod now uses commas as decimal-separator - const double d1 = std::strtod("3,14", nullptr); - CHECK(d1 == 3.14); + CHECK(std::strtod("3,14", nullptr) == 3.14); // verify that strtod does not understand dots as decimal separator - const double d2 = std::strtod("3.14", nullptr); - CHECK(d2 == 3); + CHECK(std::strtod("3.14", nullptr) == 3); } // verify that parsed correctly despite using strtod internally - const json j1 = json::parse("3.14"); - CHECK(j1.get() == 3.14); + CHECK(json::parse("3.14").get() == 3.14); // check a different code path - const json j2 = json::parse("1.000000000000000000000000000000000000000000000000000000000000000000000000"); - CHECK(j2.get() == 1.0); - - loc->decimal_point = orig_decimal_point; - // restore original locale - // setlocale(LC_ALL, orig_locale_name.c_str()); + CHECK(json::parse("1.000000000000000000000000000000000000000000000000000000000000000000000000").get() == 1.0); } From cd0b651d43c303dbe0b8016a1305ab9e5fef6449 Mon Sep 17 00:00:00 2001 From: Alex Astashyn Date: Mon, 12 Dec 2016 19:46:47 -0500 Subject: [PATCH 16/18] Tweaked check for preserved sign; added LCOV_EXCL_LINE --- src/json.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/json.hpp b/src/json.hpp index 867afbb350..b526400cff 100644 --- a/src/json.hpp +++ b/src/json.hpp @@ -9103,7 +9103,7 @@ class basic_json if (!p) { - return false; + return false; // LCOV_EXCL_LINE } if (*p == '-' or *p == '+') @@ -9113,7 +9113,7 @@ class basic_json if (p == m_end) { - return false; + return false; // LCOV_EXCL_LINE } while (p < m_end and *p >= '0' @@ -9252,11 +9252,11 @@ class basic_json value = static_cast(x); return x == static_cast(value) // x fits into destination T + and (x < 0) == (value < 0) // preserved sign and (x != 0 or is_integral()) // strto[u]ll did nto fail and errno == 0 // strto[u]ll did not overflow and m_start < m_end // token was not empty - and endptr == m_end // parsed entire token exactly - and (x < 0) == (*m_start == '-'); // input was sign-compatible + and endptr == m_end; // parsed entire token exactly } #else // parsing integral types manually From 0f8de48ddbc03b842ef5523b10553d3d319e1666 Mon Sep 17 00:00:00 2001 From: Alex Astashyn Date: Mon, 12 Dec 2016 19:48:14 -0500 Subject: [PATCH 17/18] Disabling strtod pre-check, since can't get locale-specific behavior to manifest in AppVeyor --- test/src/unit-regression.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/src/unit-regression.cpp b/test/src/unit-regression.cpp index 5cd6356d35..7016fcafdf 100644 --- a/test/src/unit-regression.cpp +++ b/test/src/unit-regression.cpp @@ -407,6 +407,9 @@ TEST_CASE("regression tests") { setlocale(LC_NUMERIC, "de_DE.UTF-8"); + // disabled, because locale-specific beharivor is not + // triggered in AppVeyor for some reason +#if 0 { // verify that strtod now uses commas as decimal-separator CHECK(std::strtod("3,14", nullptr) == 3.14); @@ -414,6 +417,7 @@ TEST_CASE("regression tests") // verify that strtod does not understand dots as decimal separator CHECK(std::strtod("3.14", nullptr) == 3); } +#endif // verify that parsed correctly despite using strtod internally CHECK(json::parse("3.14").get() == 3.14); From 5cad2006eb99534274c380af1ad92d5b257c83bc Mon Sep 17 00:00:00 2001 From: Alex Astashyn Date: Mon, 12 Dec 2016 20:15:57 -0500 Subject: [PATCH 18/18] Tweaked check for preserved sign; added LCOV_EXCL_LINE --- src/json.hpp.re2c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/json.hpp.re2c b/src/json.hpp.re2c index 5b2dc8d2d3..208489eabe 100644 --- a/src/json.hpp.re2c +++ b/src/json.hpp.re2c @@ -8252,7 +8252,7 @@ class basic_json if (!p) { - return false; + return false; // LCOV_EXCL_LINE } if (*p == '-' or *p == '+') @@ -8262,7 +8262,7 @@ class basic_json if (p == m_end) { - return false; + return false; // LCOV_EXCL_LINE } while (p < m_end and *p >= '0' @@ -8401,11 +8401,11 @@ class basic_json value = static_cast(x); return x == static_cast(value) // x fits into destination T + and (x < 0) == (value < 0) // preserved sign and (x != 0 or is_integral()) // strto[u]ll did nto fail and errno == 0 // strto[u]ll did not overflow and m_start < m_end // token was not empty - and endptr == m_end // parsed entire token exactly - and (x < 0) == (*m_start == '-'); // input was sign-compatible + and endptr == m_end; // parsed entire token exactly } #else // parsing integral types manually