From 3cdcef3382168b27e78ed1b2943110aa69be5b08 Mon Sep 17 00:00:00 2001 From: alandefreitas Date: Fri, 4 Mar 2022 01:37:50 -0300 Subject: [PATCH] syntax-based normalization fix #8, fix #65, close #136 --- include/boost/url/detail/impl/normalize.ipp | 259 +++++++++ .../boost/url/detail/impl/pct_encoding.ipp | 1 + .../url/detail/impl/remove_dot_segments.ipp | 540 ++++++++++++++++++ include/boost/url/detail/normalize.hpp | 64 +++ include/boost/url/detail/pct_encoding.hpp | 12 +- .../boost/url/detail/remove_dot_segments.hpp | 38 ++ include/boost/url/grammar/ascii.hpp | 32 +- include/boost/url/impl/url.hpp | 28 - include/boost/url/impl/url.ipp | 330 ++++++++--- include/boost/url/impl/url_view.ipp | 61 ++ include/boost/url/ipv4_address.hpp | 2 +- include/boost/url/ipv6_address.hpp | 2 +- include/boost/url/pct_encoding.hpp | 3 +- include/boost/url/rfc/detail/charsets.hpp | 112 ++++ include/boost/url/rfc/reg_name_rule.hpp | 1 - include/boost/url/src.hpp | 3 + include/boost/url/url.hpp | 157 ++++- include/boost/url/url_view.hpp | 26 + test/unit/url.cpp | 117 ++++ 19 files changed, 1648 insertions(+), 140 deletions(-) create mode 100644 include/boost/url/detail/impl/normalize.ipp create mode 100644 include/boost/url/detail/impl/remove_dot_segments.ipp create mode 100644 include/boost/url/detail/normalize.hpp create mode 100644 include/boost/url/detail/remove_dot_segments.hpp delete mode 100644 include/boost/url/impl/url.hpp create mode 100644 include/boost/url/rfc/detail/charsets.hpp diff --git a/include/boost/url/detail/impl/normalize.ipp b/include/boost/url/detail/impl/normalize.ipp new file mode 100644 index 00000000..5ae8a971 --- /dev/null +++ b/include/boost/url/detail/impl/normalize.ipp @@ -0,0 +1,259 @@ +// +// Copyright (c) 2016-2019 Vinnie Falco (vinnie dot falco at gmail dot com) +// Copyright (c) 2022 Alan Freitas (alandefreitas@gmail.com) +// +// Distributed under the Boost Software License, Version 1.0. 
(See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// +// Official repository: https://github.com/CPPAlliance/url +// + +#ifndef BOOST_URL_DETAIL_IMPL_NORMALIZE_IPP +#define BOOST_URL_DETAIL_IMPL_NORMALIZE_IPP + +#include +#include +#include +#include + +namespace boost { +namespace urls { +namespace detail { + +int +compare_encoded( + string_view lhs, + string_view rhs) noexcept +{ + auto consume_one = []( + string_view::iterator& it, + char& c, + std::size_t& n) + { + if(*it != '%') + { + c = *it; + ++it; + } + else + { + pct_decode_unchecked( + &c, + &c + 1, + string_view(it, 3)); + it += 3; + } + ++n; + }; + + std::size_t n0 = 0; + std::size_t n1 = 0; + auto it0 = lhs.begin(); + auto it1 = rhs.begin(); + auto end0 = lhs.end(); + auto end1 = rhs.end(); + char c0 = 0; + char c1 = 0; + while( + it0 < end0 && + it1 < end1) + { + consume_one(it0, c0, n0); + consume_one(it1, c1, n1); + if (c0 < c1) + return -1; + if (c1 < c0) + return 1; + } + n0 += pct_decode_bytes_unchecked( + string_view(it0, end0 - it0)); + n1 += pct_decode_bytes_unchecked( + string_view(it1, end1 - it1)); + if (n0 == n1) + return 0; + if (n0 < n1) + return -1; + return 1; +} + +int +ci_compare_encoded( + string_view lhs, + string_view rhs) noexcept +{ + auto consume_one = + []( string_view::iterator& it, + char &c, + std::size_t& n) + { + if(*it != '%') + { + c = grammar::ascii_tolower(*it); + ++it; + } + else + { + pct_decode_unchecked( + &c, + &c + 1, + string_view(it, 3)); + c = grammar::ascii_tolower(c); + it += 3; + } + ++n; + }; + + std::size_t n0 = 0; + std::size_t n1 = 0; + auto it0 = lhs.begin(); + auto it1 = rhs.begin(); + auto end0 = lhs.end(); + auto end1 = rhs.end(); + char c0 = 0; + char c1 = 0; + while ( + it0 < end0 && + it1 < end1) + { + consume_one(it0, c0, n0); + consume_one(it1, c1, n1); + if (c0 < c1) + return -1; + if (c1 < c0) + return 1; + } + n0 += pct_decode_bytes_unchecked( + string_view(it0, end0 - it0)); + n1 += 
pct_decode_bytes_unchecked( + string_view(it1, end1 - it1)); + if (n0 == n1) + return 0; + if (n0 < n1) + return -1; + return 1; +} + +int +ci_compare( + string_view lhs, + string_view rhs) noexcept +{ + auto rlen = (std::min)(lhs.size(), rhs.size()); + for (std::size_t i = 0; i < rlen; ++i) + { + char c0 = grammar::ascii_tolower(lhs[i]); + char c1 = grammar::ascii_tolower(rhs[i]); + if (c0 < c1) + return -1; + if (c1 < c0) + return 1; + } + if ( lhs.size() == rhs.size() ) + return 0; + if ( lhs.size() < rhs.size() ) + return -1; + return 1; +} + +std::size_t +path_starts_with( + string_view lhs, + string_view rhs) noexcept +{ + auto consume_one = []( + string_view::iterator& it, + char &c) + { + if(*it != '%') + { + c = *it; + ++it; + return; + } + pct_decode_unchecked( + &c, + &c + 1, + string_view(it, 3)); + if (c != '/') + { + it += 3; + return; + } + c = *it; + ++it; + }; + + auto it0 = lhs.begin(); + auto it1 = rhs.begin(); + auto end0 = lhs.end(); + auto end1 = rhs.end(); + char c0 = 0; + char c1 = 0; + while ( + it0 < end0 && + it1 < end1) + { + consume_one(it0, c0); + consume_one(it1, c1); + if (c0 != c1) + return 0; + } + if (it1 == end1) + return it0 - lhs.begin(); + return 0; +} + +std::size_t +path_ends_with( + string_view lhs, + string_view rhs) noexcept +{ + auto consume_last = []( + string_view::iterator& it, + string_view::iterator& end, + char& c) + { + if ((end - it) < 3 || + *(std::prev(end, 3)) != '%') + { + c = *--end; + return; + } + pct_decode_unchecked( + &c, + &c + 1, + string_view(std::prev( + end, 3), 3)); + if (c != '/') + { + end -= 3; + return; + } + c = *--end; + }; + + auto it0 = lhs.begin(); + auto it1 = rhs.begin(); + auto end0 = lhs.end(); + auto end1 = rhs.end(); + char c0 = 0; + char c1 = 0; + while( + it0 < end0 && + it1 < end1) + { + consume_last(it0, end0, c0); + consume_last(it1, end1, c1); + if (c0 != c1) + return 0; + } + if (it1 == end1) + return lhs.end() - end0; + return 0; +} + +} // detail +} // urls +} // boost + 
+#endif diff --git a/include/boost/url/detail/impl/pct_encoding.ipp b/include/boost/url/detail/impl/pct_encoding.ipp index d2565fdb..d10014f1 100644 --- a/include/boost/url/detail/impl/pct_encoding.ipp +++ b/include/boost/url/detail/impl/pct_encoding.ipp @@ -1,5 +1,6 @@ // // Copyright (c) 2016-2019 Vinnie Falco (vinnie dot falco at gmail dot com) +// Copyright (c) 2022 Alan Freitas (alandefreitas@gmail.com) // // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) diff --git a/include/boost/url/detail/impl/remove_dot_segments.ipp b/include/boost/url/detail/impl/remove_dot_segments.ipp new file mode 100644 index 00000000..f977b849 --- /dev/null +++ b/include/boost/url/detail/impl/remove_dot_segments.ipp @@ -0,0 +1,540 @@ +// +// Copyright (c) 2022 alandefreitas (alandefreitas@gmail.com) +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// +// Official repository: https://github.com/CPPAlliance/url +// + +#ifndef BOOST_URL_DETAIL_IMPL_REMOVE_DOT_SEGMENTS_IPP +#define BOOST_URL_DETAIL_IMPL_REMOVE_DOT_SEGMENTS_IPP + +#include +#include +#include + +namespace boost { +namespace urls { +namespace detail { + +std::size_t +remove_dot_segments( + char* dest0, + char const* end, + string_view s, + bool remove_unmatched) noexcept +{ + // 1. The input buffer is initialized with + // the now-appended path components and the + // output buffer is initialized to the empty + // string. 
+ char* dest = dest0; + // append `in` to the output and advance dest + auto append = + [&dest, &end] + (string_view in) + { + BOOST_ASSERT(in.size() <= std::size_t(end - dest)); + // memmove, not memcpy: url::normalize_path() + // runs this in place (dest aliases s), so the + // source and destination ranges may overlap + std::memmove(dest, in.data(), in.size()); + dest += in.size(); + }; + + // offset of the last '/' written to the output + // so far, or npos if the output contains none + auto find_last_slash = + [&dest0, &dest]() -> std::size_t + { + char* const first = dest0; + char* last = dest; + while (last != first) + { + --last; + if (*last == '/') + return last - first; + } + return string_view::npos; + }; + + // Step 2 is a loop through 5 production rules + // There are no transitions between all rules, + // which enables some optimizations. + // A. If the input buffer begins with a + // prefix of "../" or "./", then remove + // that prefix from the input buffer; + // otherwise, + // Rule A can only happen at the beginning: + // - B and C write "/" to the output + // - D can only happen at the end + // - E leaves "/" or happens at the end + while (!s.empty()) + { + if (s.starts_with("../")) + { + if (!remove_unmatched) + append(s.substr(0, 3)); + s.remove_prefix(3); + continue; + } + if (!s.starts_with("./")) + break; + s.remove_prefix(2); + } + + // D. if the input buffer consists only + // of "." or "..", then remove that from + // the input buffer; otherwise, + // Rule D can only happen after A is consumed: + // - B and C write "/" to the output + // - D can only happen at the end + // - E leaves "/" or happens at the end + if( s == "." || + s == "..") + { + if( ! remove_unmatched && + s == "..") + append(s); + s = {}; + } + + // 2. While the input buffer is not empty, + // loop as follows: + while (!s.empty()) + { + // B. if the input buffer begins with a + // prefix of "/./" or "/.", where "." 
is + // a complete path segment, then replace + // that prefix with "/" in the input + // buffer; otherwise, + if (s.starts_with("/./")) + { + s.remove_prefix(2); + continue; + } + if (s == "/.") + { + // equivalent to replacing s with '/' + // and executing the next iteration + append(s.substr(0, 1)); + s.remove_prefix(2); + continue; + } + + // C. if the input buffer begins with a + // prefix of "/../" or "/..", where ".." + // is a complete path segment, then + // replace that prefix with "/" in the + // input buffer and remove the last + // segment and its preceding "/" + // (if any) from the output buffer; + // otherwise, + if (s.starts_with("/../")) + { + std::size_t p = find_last_slash(); + if (p != string_view::npos) + // "erase" [p, end] + dest = dest0 + p; + else if (!remove_unmatched) + append(s.substr(0, 3)); + s.remove_prefix(3); + continue; + } + if (s == "/..") + { + std::size_t p = find_last_slash(); + if (p != string_view::npos) + { + // erase [p, end] + dest = dest0 + p; + // equivalent to replacing s with '/' + // and executing the next iteration. + // this is the only point that would + // require input memory allocations + // in remove_dot_segments + append(s.substr(0, 1)); + } + else if (remove_unmatched) + append(s.substr(0, 1)); + else + append(s.substr(0, 3)); + s.remove_prefix(3); + continue; + } + + // E. move the first path segment in the + // input buffer to the end of the output + // buffer, including the initial "/" + // character (if any) and any subsequent + // characters up to, but not including, + // the next "/" character or the end of + // the input buffer. + std::size_t p = s.find_first_of('/', 1); + if (p != string_view::npos) + { + append(s.substr(0, p)); + s.remove_prefix(p); + } + else + { + append(s); + s = {}; + } + } + + // 3. Finally, the output buffer is returned + // as the result of remove_dot_segments. 
+ return dest - dest0; +} + +int +normalized_path_compare( + string_view s0_init, + string_view s1_init, + bool r0, + bool r1) noexcept +{ + // Pseudocode: + // Execute remove_dot_segments iterations in reverse: + // - keep track of number of elements + // - keep track of normalized size + // Iterate the both path segments in reverse again: + // - use normalized size to identify the + // positions we are comparing + + // 1. The input buffer is initialized with + // the now-appended path components and the + // output buffer is initialized to the empty + // string. + // - These memory allocations are logically removed. + + // Step 2 is a loop through 5 production rules + // There are no transitions between all rules, + // which enables some optimizations, such as + // ignoring the prefix rules before applying + // other rules. + auto remove_prefix = + []( string_view& s ) + { + // A. If the input buffer begins with a + // prefix of "../" or "./", then remove + // that prefix from the input buffer, + // otherwise, + // Rule A can only happen at the beginning: + // - B and C write "/" to the output + // - D can only happen at the end + // - E leaves "/" or happens at the end + std::size_t out = 0; + std::size_t n = 0; + while (!s.empty()) + { + n = detail::path_starts_with(s, "../"); + if (n) + { + out += 3; + s.remove_prefix(n); + continue; + } + n = detail::path_starts_with(s, "./"); + if (n) + { + s.remove_prefix(n); + continue; + } + break; + } + + // D. if the input buffer consists only + // of "." or "..", then remove that from + // the input buffer; otherwise, ... 
+ // Rule D can only happen after A is consumed: + // - B and C write "/" to the output + // - D can only happen at the end + // - E leaves "/" or happens at the end + if (detail::compare_encoded(s, ".") == 0) + s = {}; + else if (detail::compare_encoded(s, "..") == 0) + { + out += 2; + s = {}; + } + return out; + }; + string_view s0 = s0_init; + string_view s1 = s1_init; + std::size_t s0_prefix_n = remove_prefix(s0); + std::size_t s1_prefix_n = remove_prefix(s1); + + auto pop_last = []( + string_view& s, + string_view& c, + std::size_t& level, + bool r) + { + c = {}; + std::size_t n = 0; + while (!s.empty()) + { + // B. if the input buffer begins with a + // prefix of "/./" or "/.", where "." is + // a complete path segment, then replace + // that prefix with "/" in the input + // buffer; otherwise, + n = detail::path_ends_with(s, "/./"); + if (n) + { + c = s.substr(s.size() - n); + s.remove_suffix(n); + continue; + } + n = detail::path_ends_with(s, "/."); + if (n) + { + c = s.substr(s.size() - n, 1); + s.remove_suffix(n); + continue; + } + + // C. if the input buffer begins with a + // prefix of "/../" or "/..", where ".." + // is a complete path segment, then + // replace that prefix with "/" in the + // input buffer and remove the last + // segment and its preceding "/" + // (if any) from the output buffer + // otherwise, + n = detail::path_ends_with(s, "/../"); + if (n) + { + c = s.substr(s.size() - n); + s.remove_suffix(n); + ++level; + continue; + } + n = detail::path_ends_with(s, "/.."); + if (n) + { + c = s.substr(s.size() - n); + s.remove_suffix(n); + ++level; + continue; + } + + // E. move the first path segment in the + // input buffer to the end of the output + // buffer, including the initial "/" + // character (if any) and any subsequent + // characters up to, but not including, + // the next "/" character or the end of + // the input buffer. + std::size_t p = s.size() > 1 + ? 
s.find_last_of('/', s.size() - 2) + : string_view::npos; + if (p != string_view::npos) + { + c = s.substr(p + 1); + s.remove_suffix(c.size()); + } + else + { + c = s; + s = {}; + } + + if (level == 0) + return; + if (!s.empty()) + --level; + } + // we still need to skip n_skip + 1 + // but the string is empty + if (r && level) + { + c = "/"; + level = 0; + return; + } + else if (level) + { + if (c.empty()) + c = "/.."; + else + c = "/../"; + --level; + return; + } + c = {}; + }; + + // number of decoded bytes in a path segment + auto path_decoded_bytes = + []( string_view s ) + { + auto it = s.data(); + auto const end = + it + s.size(); + std::size_t n = 0; + while(it < end) + { + if(*it != '%') + { + // unescaped + ++it; + ++n; + continue; + } + if(end - it < 3) + return n; + char c = 0; + pct_decode_unchecked( + &c, + &c + 1, + string_view(it, 3)); + if (c != '/') + it += 3; + else + ++it; + ++n; + } + return n; + }; + + // Calculate the normalized size + auto norm_bytes = + [&pop_last, &path_decoded_bytes] + ( string_view p, + bool r) + { + string_view c; + std::size_t s{0}; + std::size_t n{0}; + do + { + pop_last(p, c, s, r); + n += path_decoded_bytes(c); + } + while (!c.empty()); + return n; + }; + std::size_t s0n = norm_bytes( + s0, + r0); + if (!r0) + s0n += s0_prefix_n; + + std::size_t s1n = norm_bytes( + s1, + r1); + if (!r1) + s1n += s1_prefix_n; + + // Remove child segments until last intersection + s0 = s0_init; + s1 = s1_init; + string_view s0c; + string_view s1c; + std::size_t s0l = 0; + std::size_t s1l = 0; + std::size_t s0i = s0n; + std::size_t s1i = s1n; + pop_last( + s0, s0c, s0l, + r0); + pop_last( + s1, s1c, s1l, + r1); + + // Consume incomparable segments + auto pop_decoded_back = + []( string_view& s ) + { + if (s.size() < 3 || + *std::prev(s.end(), 3) != '%') + { + char c = s.back(); + s.remove_suffix(1); + return c; + } + char c = 0; + pct_decode_unchecked( + &c, &c + 1, s.substr(s.size() - 3)); + if (c != '/') + { + s.remove_suffix(3); + 
return c; + } + c = s.back(); + s.remove_suffix(1); + return c; + }; + + while (s0i != s1i) + { + // Consume more child segments + if (s0c.empty()) + pop_last( + s0, s0c, s0l, r0); + if (s1c.empty()) + pop_last( + s1, s1c, s1l, r1); + + // Remove incomparable suffix + while ( + !s0c.empty() && + !s1c.empty()) + { + if (s1i > s0i) + { + pop_decoded_back(s1c); + --s1i; + continue; + } + else if (s0i > s1i) + { + pop_decoded_back(s0c); + --s0i; + continue; + } + break; + } + } + + int cmp = 0; + BOOST_ASSERT(s0i == s1i); + while (s0i > 0) + { + // Consume more child segments + if (s0c.empty()) + pop_last( + s0, s0c, s0l, r0); + if (s1c.empty()) + pop_last( + s1, s1c, s1l, r1); + + // Compare intersection + while ( + !s0c.empty() && + !s1c.empty()) + { + BOOST_ASSERT(s0i == s1i); + char c0 = pop_decoded_back(s0c); + char c1 = pop_decoded_back(s1c); + if (c0 < c1) + cmp = -1; + else if (c1 < c0) + cmp = 1; + --s0i; + --s1i; + } + } + + if (cmp != 0) + return cmp; + if (s0n == s1n ) + return 0; + if (s0n < s1n ) + return -1; + return 1; +} + +} // detail +} // urls +} // boost + +#endif diff --git a/include/boost/url/detail/normalize.hpp b/include/boost/url/detail/normalize.hpp new file mode 100644 index 00000000..d3161455 --- /dev/null +++ b/include/boost/url/detail/normalize.hpp @@ -0,0 +1,64 @@ +// +// Copyright (c) 2016-2019 Vinnie Falco (vinnie dot falco at gmail dot com) +// Copyright (c) 2022 Alan Freitas (alandefreitas@gmail.com) +// +// Distributed under the Boost Software License, Version 1.0. 
(See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// +// Official repository: https://github.com/CPPAlliance/url +// + +#ifndef BOOST_URL_DETAIL_NORMALIZED_HPP +#define BOOST_URL_DETAIL_NORMALIZED_HPP + +#include +#include + +namespace boost { +namespace urls { +namespace detail { + +// compare two string_views as if they are both +// percent-decoded +int +compare_encoded( + string_view lhs, + string_view rhs) noexcept; + +// check if string_view lhs starts with string_view +// rhs as if they are both percent-decoded. If +// lhs starts with rhs, return number of chars +// matched in the encoded string_view +std::size_t +path_starts_with( + string_view lhs, + string_view rhs) noexcept; + +// check if string_view lhs ends with string_view +// rhs as if they are both percent-decoded. If +// lhs ends with rhs, return number of chars +// matched in the encoded string_view +std::size_t +path_ends_with( + string_view lhs, + string_view rhs) noexcept; + +// compare two string_views as if they are both +// percent-decoded and lowercase +int +ci_compare_encoded( + string_view lhs, + string_view rhs) noexcept; + +// compare two string_views as if they are both +// lowercase +int +ci_compare( + string_view lhs, + string_view rhs) noexcept; + +} // detail +} // urls +} // boost + +#endif diff --git a/include/boost/url/detail/pct_encoding.hpp b/include/boost/url/detail/pct_encoding.hpp index 4d051d50..f08485a2 100644 --- a/include/boost/url/detail/pct_encoding.hpp +++ b/include/boost/url/detail/pct_encoding.hpp @@ -1,5 +1,6 @@ // // Copyright (c) 2016-2019 Vinnie Falco (vinnie dot falco at gmail dot com) +// Copyright (c) 2022 Alan Freitas (alandefreitas@gmail.com) // // Distributed under the Boost Software License, Version 1.0. 
(See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) @@ -21,17 +22,6 @@ namespace boost { namespace urls { namespace detail { -/** Return true if plain equals a decoded percent-encoded string - - This function compares a plain key to a - percent-encoded string. The comparison is - made as if the key were percent-encoded. - - @param plain_key The key to use for comparison. - - @param encoded The percent-encoded string to - compare to. -*/ BOOST_URL_DECL bool key_equal_encoded( diff --git a/include/boost/url/detail/remove_dot_segments.hpp b/include/boost/url/detail/remove_dot_segments.hpp new file mode 100644 index 00000000..f8dfddd5 --- /dev/null +++ b/include/boost/url/detail/remove_dot_segments.hpp @@ -0,0 +1,38 @@ +// +// Copyright (c) 2022 alandefreitas (alandefreitas@gmail.com) +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// +// Official repository: https://github.com/CPPAlliance/url +// + +#ifndef BOOST_URL_DETAIL_REMOVE_DOT_SEGMENTS_HPP +#define BOOST_URL_DETAIL_REMOVE_DOT_SEGMENTS_HPP + +#include +#include + +namespace boost { +namespace urls { +namespace detail { + +std::size_t +remove_dot_segments( + char* dest, + char const* end, + string_view s, + bool remove_unmatched) noexcept; + +int +normalized_path_compare( + string_view lhs, + string_view rhs, + bool remove_unmatched_lhs, + bool remove_unmatched_rhs) noexcept; + +} // detail +} // urls +} // boost + +#endif diff --git a/include/boost/url/grammar/ascii.hpp b/include/boost/url/grammar/ascii.hpp index fba82546..a4831c33 100644 --- a/include/boost/url/grammar/ascii.hpp +++ b/include/boost/url/grammar/ascii.hpp @@ -1,5 +1,6 @@ // // Copyright (c) 2016-2019 Vinnie Falco (vinnie dot falco at gmail dot com) +// Copyright (c) 2022 Alan Freitas (alandefreitas@gmail.com) // // Distributed under the Boost Software License, Version 1.0. 
(See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) @@ -40,8 +41,35 @@ constexpr char ascii_tolower(char c) noexcept { - return (c >= 'A' && c <= 'Z') - ? c + 'a' - 'A' : c; + return + (c >= 'A' && + c <= 'Z') + ? c + 'a' - 'A' + : c; +} + +/** Return c converted to upper case + + This function returns the uppercase equivalent + if a character is a lower-case letter, otherwise + returns the same character. + + @par Exception Safety + Throws nothing. + + @return The converted character + + @param c The ascii character to convert +*/ +constexpr +char +ascii_toupper(char c) noexcept +{ + return + (c >= 'a' && + c <= 'z') + ? c - ('a' - 'A') + : c; } } // grammar diff --git a/include/boost/url/impl/url.hpp b/include/boost/url/impl/url.hpp deleted file mode 100644 index d4d73f43..00000000 --- a/include/boost/url/impl/url.hpp +++ /dev/null @@ -1,28 +0,0 @@ -// -// Copyright (c) 2019 Vinnie Falco (vinnie.falco@gmail.com) -// -// Distributed under the Boost Software License, Version 1.0. (See accompanying -// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) -// -// Official repository: https://github.com/CPPAlliance/url -// - -#ifndef BOOST_URL_IMPL_URL_HPP -#define BOOST_URL_IMPL_URL_HPP - -namespace boost { -namespace urls { - -template -urls::segments -url:: -segments( - Allocator const& a) noexcept -{ - return urls::segments(*this, a); -} - -} // urls -} // boost - -#endif \ No newline at end of file diff --git a/include/boost/url/impl/url.ipp b/include/boost/url/impl/url.ipp index 465e112c..98f03613 100644 --- a/include/boost/url/impl/url.ipp +++ b/include/boost/url/impl/url.ipp @@ -1,5 +1,6 @@ // // Copyright (c) 2019 Vinnie Falco (vinnie.falco@gmail.com) +// Copyright (c) 2022 Alan Freitas (alandefreitas@gmail.com) // // Distributed under the Boost Software License, Version 1.0. 
(See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) @@ -14,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -27,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -428,14 +431,11 @@ set_user(string_view s) this->string()); s = buf.maybe_copy(s); check_invariants(); - static constexpr auto cs = - unreserved_chars + - subdelim_chars; - auto const n = - pct_encode_bytes(s, {}, cs); + auto const n = pct_encode_bytes( + s, {}, detail::user_chars); auto dest = set_user_impl(n); pct_encode(dest, get(id_pass).data(), - s, {}, cs); + s, {}, detail::user_chars); decoded_[id_user] = s.size(); check_invariants(); return *this; @@ -451,11 +451,8 @@ set_encoded_user( s = buf.maybe_copy(s); check_invariants(); error_code ec; - static constexpr auto cs = - unreserved_chars + - subdelim_chars; auto const n = - validate_pct_encoding(s, ec, {}, cs); + validate_pct_encoding(s, ec, {}, detail::user_chars); if(ec.failed()) detail::throw_invalid_argument( BOOST_CURRENT_LOCATION); @@ -528,18 +525,15 @@ set_password(string_view s) this->string()); s = buf.maybe_copy(s); check_invariants(); - static constexpr auto cs = - unreserved_chars + - subdelim_chars + ':'; - auto const n = - pct_encode_bytes(s, {}, cs); + auto const n = pct_encode_bytes( + s, {}, detail::password_chars); auto dest = set_password_impl(n); pct_encode( dest, get(id_host).data() - 1, s, {}, - cs); + detail::password_chars); decoded_[id_pass] = s.size(); check_invariants(); return *this; @@ -554,12 +548,9 @@ set_encoded_password( this->string()); s = buf.maybe_copy(s); check_invariants(); - static constexpr auto cs = - unreserved_chars + - subdelim_chars + ':'; error_code ec; auto const n = - validate_pct_encoding(s, ec, {}, cs); + validate_pct_encoding(s, ec, {}, detail::password_chars); if(ec.failed()) detail::throw_invalid_argument( BOOST_CURRENT_LOCATION); @@ -623,18 +614,15 @@ set_userinfo( this->string()); s = 
buf.maybe_copy(s); check_invariants(); - static constexpr auto cs = - unreserved_chars + - subdelim_chars; - auto const n = - pct_encode_bytes(s, {}, cs); + auto const n = pct_encode_bytes( + s, {}, detail::userinfo_chars); auto dest = set_userinfo_impl(n); pct_encode( dest, get(id_host).data() - 1, s, {}, - cs); + detail::userinfo_chars); decoded_[id_user] = s.size(); check_invariants(); return *this; @@ -760,18 +748,15 @@ set_host( return set_host(r.value()); } check_invariants(); - static constexpr auto cs = - unreserved_chars + - subdelim_chars; - auto const n = - pct_encode_bytes(s, {}, cs); + auto const n = pct_encode_bytes( + s, {}, detail::host_chars); auto dest = set_host_impl(n); pct_encode( dest, get(id_path).data(), s, {}, - cs); + detail::host_chars); decoded_[id_host] = s.size(); host_type_ = urls::host_type::name; @@ -1801,17 +1786,15 @@ set_fragment( this->string()); s = buf.maybe_copy(s); check_invariants(); - static constexpr auto cs = - pchars + '/' + '?'; - auto const n = - pct_encode_bytes(s, {}, cs); + auto const n = pct_encode_bytes( + s, {}, fragment_chars); auto dest = set_fragment_impl(n); pct_encode( dest, get(id_end).data(), s, {}, - cs); + fragment_chars); decoded_[id_frag] = s.size(); check_invariants(); return *this; @@ -2042,26 +2025,167 @@ resolve( // //------------------------------------------------ -url& +void url:: -normalize() +normalize_octets_impl( + int id, + grammar::lut_chars const& cs) noexcept { -#if 0 - // scheme + char* it = s_ + offset(id); + char* end = s_ + offset(id + 1); + char buf = 0; + char* dest = it; + while (it < end) { - char* it = s_ - + offset(id_scheme); - auto last = it + offset(id_user); - if(it < last) + if (*it != '%') { - --last; - while(it != last) - { - // to upper - } + *dest = *it; + ++it; + ++dest; + continue; } + if (end - it < 3) + break; + + // decode unreserved octets + pct_decode_unchecked( + &buf, + &buf + 1, + string_view(it, 3)); + if (cs(buf)) + { + *dest = buf; + it += 3; + ++dest; + 
continue; + } + + // uppercase percent-encoding triplets + // and copy them to dest: after an earlier + // decode dest trails it, so the triplet + // must be moved, not just edited in place + *dest++ = *it++; + *dest++ = grammar::ascii_toupper(*it++); + *dest++ = grammar::ascii_toupper(*it++); + } + if (it != dest) + { + std::size_t diff = it - dest; + std::size_t n = len(id) - diff; + shrink_impl(id, n); + s_[size()] = '\0'; + } +} + +void +url:: +decoded_to_lower_impl(int id) noexcept +{ + char* it = s_ + offset(id); + char const* const end = s_ + offset(id + 1); + while(it < end) + { + if (*it != '%') + { + *it = grammar::ascii_tolower( + *it); + ++it; + continue; + } + // leave the escape triplet unchanged + it += 3; + } +} + +void +url:: +to_lower_impl(int id) noexcept +{ + char* it = s_ + offset(id); + char const* const end = s_ + offset(id + 1); + while(it < end) + { + *it = grammar::ascii_tolower( + *it); + ++it; + } +} + +url& +url:: +normalize_scheme() +{ + to_lower_impl(id_scheme); + return *this; +} + +url& +url:: +normalize_authority() +{ + // normalize host + if (host_type() == urls::host_type::name) + { + normalize_octets_impl( + id_host, + detail::reg_name_chars); + } + decoded_to_lower_impl(id_host); + + // normalize password + normalize_octets_impl(id_pass, detail::password_chars); + + // normalize user + normalize_octets_impl(id_user, detail::user_chars); + return *this; +} + +url& +url:: +normalize_path() +{ + normalize_octets_impl(id_path, detail::path_chars); + string_view p = encoded_path(); + char* p_dest = s_ + offset(id_path); + char* p_end = s_ + offset(id_path + 1); + std::size_t pn = p.size(); + bool abs = is_path_absolute(); + std::size_t n = detail::remove_dot_segments( + p_dest, p_end, p, abs); + if (n != pn) + { + BOOST_ASSERT(n < pn); + shrink_impl(id_path, n); + // count over the shrunken path only; p + // still spans the old, longer range. + // NOTE(review): an empty path is taken + // to have zero segments - confirm + nseg_ = n == 0 ? 0 : std::count( + p_dest + 1, p_dest + n, '/') + 1; + } + return *this; +} + +url& +url:: +normalize_query() +{ + normalize_octets_impl(id_query, query_chars); + return *this; +} + +url& +url:: +normalize_fragment() +{ + normalize_octets_impl(id_frag, fragment_chars); + return *this; +} + +url& +url:: +normalize() +{ + 
normalize_fragment(); + normalize_query(); + normalize_path(); + normalize_authority(); + normalize_scheme(); return *this; } @@ -2276,47 +2400,7 @@ resize_impl( return s_ + offset(first); if(new_len <= n0) { - // shrinking - std::size_t n = n0 - new_len; - auto const pos = - offset(last); - // adjust chars - std::memmove( - s_ + pos - n, - s_ + pos, - offset( - id_end) - pos + 1); - // collapse (first, last) - collapse(first, last, - offset(last) - n); - // shift (last, end) left - adjust( - last, id_end, 0 - n); -#if 0 - // update table - if( nseg > 1 && - first <= id_path) - { - // adjust segments - auto const tab = - tab_end() - 1; - for(std::size_t i = 0; - i < nseg - 1; ++i) - tab[0-2*i] += 0 - n; - } - if( nparam > 1 && - first <= id_query) - { - // adjust params - auto const tab = - tab_end() - 2; - for(std::size_t i = 0; - i < nparam - 1; ++i) - tab[0-2*i] += 0 - n; - } -#endif - s_[size()] = '\0'; - return s_ + offset(first); + return shrink_impl(first, last, new_len); } // growing @@ -2365,6 +2449,68 @@ resize_impl( return s_ + offset(first); } +char* +url:: +shrink_impl( + int id, + std::size_t new_size) +{ + return shrink_impl( + id, id + 1, new_size); +} + +char* +url:: +shrink_impl( + int first, + int last, + std::size_t new_len) +{ + // shrinking + auto const n0 = len(first, last); + BOOST_ASSERT(new_len <= n0); + std::size_t n = n0 - new_len; + auto const pos = + offset(last); + // adjust chars + std::memmove( + s_ + pos - n, + s_ + pos, + offset( + id_end) - pos + 1); + // collapse (first, last) + collapse(first, last, + offset(last) - n); + // shift (last, end) left + adjust( + last, id_end, 0 - n); +#if 0 + // update table + if( nseg > 1 && + first <= id_path) + { + // adjust segments + auto const tab = + tab_end() - 1; + for(std::size_t i = 0; + i < nseg - 1; ++i) + tab[0-2*i] += 0 - n; + } + if( nparam > 1 && + first <= id_query) + { + // adjust params + auto const tab = + tab_end() - 2; + for(std::size_t i = 0; + i < nparam - 1; ++i) + 
tab[0-2*i] += 0 - n; + } +#endif + s_[size()] = '\0'; + return s_ + offset(first); +} + //------------------------------------------------ std::ostream& diff --git a/include/boost/url/impl/url_view.ipp b/include/boost/url/impl/url_view.ipp index 7ba571e7..1b7742e5 100644 --- a/include/boost/url/impl/url_view.ipp +++ b/include/boost/url/impl/url_view.ipp @@ -1,5 +1,6 @@ // // Copyright (c) 2019 Vinnie Falco (vinnie.falco@gmail.com) +// Copyright (c) 2022 Alan Freitas (alandefreitas@gmail.com) // // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) @@ -13,7 +14,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -442,6 +445,64 @@ encoded_fragment() const noexcept return s.substr(1); } +//------------------------------------------------ +// +// Comparisons +// +//------------------------------------------------ + +int +url_view:: +compare(const url_view& other) const noexcept +{ + int comp = detail::ci_compare( + scheme(), + other.scheme()); + if ( comp != 0 ) + return comp; + + comp = detail::compare_encoded( + encoded_user(), + other.encoded_user()); + if ( comp != 0 ) + return comp; + + comp = detail::compare_encoded( + encoded_password(), + other.encoded_password()); + if ( comp != 0 ) + return comp; + + comp = detail::ci_compare_encoded( + encoded_host(), + other.encoded_host()); + if ( comp != 0 ) + return comp; + + comp = detail::normalized_path_compare( + encoded_path(), + other.encoded_path(), + is_path_absolute(), + other.is_path_absolute()); + if ( comp != 0 ) + return comp; + + comp = detail::compare_encoded( + encoded_query(), + other.encoded_query()); + if ( comp != 0 ) + return comp; + + comp = detail::compare_encoded( + encoded_fragment(), + other.encoded_fragment()); + if ( comp != 0 ) + return comp; + + return 0; +} + + //------------------------------------------------ // // Parsing diff --git 
a/include/boost/url/ipv4_address.hpp b/include/boost/url/ipv4_address.hpp index 985878ba..b906af15 100644 --- a/include/boost/url/ipv4_address.hpp +++ b/include/boost/url/ipv4_address.hpp @@ -1,5 +1,6 @@ // // Copyright (c) 2019 Vinnie Falco (vinnie.falco@gmail.com) +// Copyright (c) 2022 Alan Freitas (alandefreitas@gmail.com) // // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) @@ -262,7 +263,6 @@ class ipv4_address /** Customization point for parsing an IPv4 address. */ - BOOST_URL_DECL friend void tag_invoke( diff --git a/include/boost/url/ipv6_address.hpp b/include/boost/url/ipv6_address.hpp index 11a01877..0c1d627a 100644 --- a/include/boost/url/ipv6_address.hpp +++ b/include/boost/url/ipv6_address.hpp @@ -1,5 +1,6 @@ // // Copyright (c) 2019 Vinnie Falco (vinnie.falco@gmail.com) +// Copyright (c) 2022 Alan Freitas (alandefreitas@gmail.com) // // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) @@ -317,7 +318,6 @@ class ipv6_address @param t Set to the result upon success. */ - BOOST_URL_DECL friend void tag_invoke( diff --git a/include/boost/url/pct_encoding.hpp b/include/boost/url/pct_encoding.hpp index 6cf56cfb..2f03a213 100644 --- a/include/boost/url/pct_encoding.hpp +++ b/include/boost/url/pct_encoding.hpp @@ -1,5 +1,6 @@ // // Copyright (c) 2019 Vinnie Falco (vinnie.falco@gmail.com) +// Copyright (c) 2022 Alan Freitas (alandefreitas@gmail.com) // // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) @@ -245,7 +246,7 @@ pct_decode( this parameter is omitted, the default options will be used. - @param cs An opitionally specified + @param cs An optionally specified character set to use. If this parameter is omitted, all characters are considered unreserved. 
diff --git a/include/boost/url/rfc/detail/charsets.hpp b/include/boost/url/rfc/detail/charsets.hpp new file mode 100644 index 00000000..d3db9aee --- /dev/null +++ b/include/boost/url/rfc/detail/charsets.hpp @@ -0,0 +1,113 @@ +// +// Copyright (c) 2022 alandefreitas (alandefreitas@gmail.com) +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// +// Official repository: https://github.com/CPPAlliance/url +// + +#ifndef BOOST_URL_RFC_DETAIL_CHARSETS_HPP +#define BOOST_URL_RFC_DETAIL_CHARSETS_HPP + +#include + +namespace boost { +namespace urls { +namespace detail { + +struct user_chars_t : grammar::lut_chars +{ + constexpr + user_chars_t() noexcept + : grammar::lut_chars( + unreserved_chars + + subdelim_chars) + { + } +}; + +constexpr user_chars_t user_chars{}; + +struct password_chars_t : grammar::lut_chars +{ + constexpr + password_chars_t() noexcept + : grammar::lut_chars( + unreserved_chars + + subdelim_chars + ':') + { + } +}; + +constexpr password_chars_t password_chars{}; + +// rfc3986 3.2.1: userinfo also admits ':' +struct userinfo_chars_t : grammar::lut_chars +{ + constexpr + userinfo_chars_t() noexcept + : grammar::lut_chars( + unreserved_chars + + subdelim_chars + ':') + { + } +}; + +constexpr userinfo_chars_t userinfo_chars{}; + +struct host_chars_t : grammar::lut_chars +{ + constexpr + host_chars_t() noexcept + : grammar::lut_chars( + unreserved_chars + + subdelim_chars) + { + } +}; + +constexpr host_chars_t host_chars{}; + +struct reg_name_chars_t : grammar::lut_chars +{ + constexpr + reg_name_chars_t() noexcept + : grammar::lut_chars( + unreserved_chars + + '-' + '.') + { + } +}; + +constexpr reg_name_chars_t reg_name_chars{}; + +struct segment_chars_t : grammar::lut_chars +{ + constexpr + segment_chars_t() noexcept + : grammar::lut_chars( + pchars) + { + } +}; + +constexpr segment_chars_t segment_chars{}; + +struct path_chars_t : grammar::lut_chars +{ + constexpr + path_chars_t() noexcept + : 
grammar::lut_chars( + segment_chars + '/') + { + } +}; + +constexpr path_chars_t path_chars{}; + +} // detail +} // urls +} // boost + +#endif diff --git a/include/boost/url/rfc/reg_name_rule.hpp b/include/boost/url/rfc/reg_name_rule.hpp index bf9b4cd0..7fa93c14 100644 --- a/include/boost/url/rfc/reg_name_rule.hpp +++ b/include/boost/url/rfc/reg_name_rule.hpp @@ -36,7 +36,6 @@ struct reg_name_rule { pct_encoded_str v; - BOOST_URL_DECL friend void tag_invoke( diff --git a/include/boost/url/src.hpp b/include/boost/url/src.hpp index cd259801..8149afbd 100644 --- a/include/boost/url/src.hpp +++ b/include/boost/url/src.hpp @@ -1,5 +1,6 @@ // // Copyright (c) 2019 Vinnie Falco (vinnie.falco@gmail.com) +// Copyright (c) 2022 Alan Freitas (alandefreitas@gmail.com) // // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) @@ -30,8 +31,10 @@ in a translation unit of the program. #include #include #include +#include #include #include +#include #include #include diff --git a/include/boost/url/url.hpp b/include/boost/url/url.hpp index d80b936c..4703c03c 100644 --- a/include/boost/url/url.hpp +++ b/include/boost/url/url.hpp @@ -1,5 +1,6 @@ // // Copyright (c) 2019 Vinnie Falco (vinnie.falco@gmail.com) +// Copyright (c) 2022 Alan Freitas (alandefreitas@gmail.com) // // Distributed under the Boost Software License, Version 1.0. 
(See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) @@ -1073,7 +1074,10 @@ class BOOST_SYMBOL_VISIBLE url template> urls::segments - segments(Allocator const& = {}) noexcept; + segments(Allocator const& a = {}) noexcept + { + return urls::segments(*this, a); + } //-------------------------------------------- // @@ -1291,13 +1295,150 @@ class BOOST_SYMBOL_VISIBLE url // Normalization // //-------------------------------------------- +private: + void + normalize_octets_impl( + int id, + grammar::lut_chars const& cs) noexcept; + + void + decoded_to_lower_impl(int id) noexcept; + + void + to_lower_impl(int id) noexcept; +public: + + /** Normalize the URL components + + Applies Syntax-based normalization to + all components of the URL. + + @par Exception Safety + Strong guarantee. + Calls to allocate may throw. + + @par Specification + @li 6.2.2 Syntax-Based Normalization (rfc3986) - /** Normalize everything. */ BOOST_URL_DECL url& normalize(); + /** Normalize the URL scheme + + Applies Syntax-based normalization to the + URL scheme. + + The scheme is normalized to lowercase. + + @par Exception Safety + Strong guarantee. + Calls to allocate may throw. + + @par Specification + @li 6.2.2 Syntax-Based Normalization (rfc3986) + + */ + BOOST_URL_DECL + url& + normalize_scheme(); + + /** Normalize the URL authority + + Applies Syntax-based normalization to the + URL authority. + + Percent-encoding triplets are normalized + to uppercase letters. Percent-encoded + octets that correspond to unreserved + characters are decoded. + + @par Exception Safety + Strong guarantee. + Calls to allocate may throw. + + @par Specification + @li 6.2.2 Syntax-Based Normalization (rfc3986) + + */ + BOOST_URL_DECL + url& + normalize_authority(); + + /** Normalize the URL path + + Applies Syntax-based normalization to the + URL path. + + Percent-encoding triplets are normalized + to uppercase letters. 
Percent-encoded + octets that correspond to unreserved + characters are decoded. Redundant + path-segments are removed. + + @par Exception Safety + Strong guarantee. + Calls to allocate may throw. + + @par Specification + @li 6.2.2 Syntax-Based Normalization (rfc3986) + + */ + BOOST_URL_DECL + url& + normalize_path(); + + /** Normalize the URL query + + Applies Syntax-based normalization to the + URL query. + + Percent-encoding triplets are normalized + to uppercase letters. Percent-encoded + octets that correspond to unreserved + characters are decoded. + + @par Exception Safety + Strong guarantee. + Calls to allocate may throw. + + @par Specification + @li 6.2.2 Syntax-Based Normalization (rfc3986) + + */ + BOOST_URL_DECL + url& + normalize_query(); + + /** Normalize the URL fragment + + Applies Syntax-based normalization to the + URL fragment. + + Percent-encoding triplets are normalized + to uppercase letters. Percent-encoded + octets that correspond to unreserved + characters are decoded. + + @par Exception Safety + Strong guarantee. + Calls to allocate may throw. 
+ + @par Specification + @li 6.2.2 Syntax-Based Normalization (rfc3986) + + */ + BOOST_URL_DECL + url& + normalize_fragment(); + //-------------------------------------------- // // Resolution @@ -1342,6 +1483,17 @@ class BOOST_SYMBOL_VISIBLE url int last, std::size_t new_size); + char* + shrink_impl( + int id, + std::size_t new_size); + + char* + shrink_impl( + int first, + int last, + std::size_t new_size); + BOOST_URL_DECL bool resolve( @@ -1417,6 +1569,5 @@ operator<<(std::ostream& os, url const& u); #include #include #include -#include #endif diff --git a/include/boost/url/url_view.hpp b/include/boost/url/url_view.hpp index 3f7a1280..41283fa2 100644 --- a/include/boost/url/url_view.hpp +++ b/include/boost/url/url_view.hpp @@ -1,5 +1,6 @@ // // Copyright (c) 2019 Vinnie Falco (vinnie.falco@gmail.com) +// Copyright (c) 2022 Alan Freitas (alandefreitas@gmail.com) // // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) @@ -1700,6 +1701,30 @@ class BOOST_SYMBOL_VISIBLE url_view opt, a, decoded_[id_frag]); } + //-------------------------------------------- + // + // Comparison + // + //-------------------------------------------- + + /** Compare two URLs. + + This function compares two URLs + according to Syntax-Based comparison + algorithm. + + @par Exception Safety + Throws nothing. 
+ + @par Specification + @li 6.2.2 Syntax-Based Normalization (rfc3986) + */ + BOOST_URL_DECL + int + compare(const url_view& other) const noexcept; + + //-------------------------------------------- // // Parsing @@ -1715,6 +1740,7 @@ class BOOST_SYMBOL_VISIBLE url_view BOOST_URL_DECL friend result parse_uri_reference(string_view s); + private: void apply(scheme_part_rule const& t) noexcept; void apply(host_rule const& h) noexcept; diff --git a/test/unit/url.cpp b/test/unit/url.cpp index cb00b412..1130b40c 100644 --- a/test/unit/url.cpp +++ b/test/unit/url.cpp @@ -1,5 +1,6 @@ // // Copyright (c) 2019 Vinnie Falco (vinnie.falco@gmail.com) +// Copyright (c) 2022 Alan Freitas (alandefreitas@gmail.com) // // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) @@ -1809,6 +1810,121 @@ class url_test //-------------------------------------------- + void + testNormalize() + { + // normalize + { + auto check = [](string_view before, + string_view after) + { + url u1 = parse_uri(before).value(); + url_view u2 = parse_uri(after).value(); + BOOST_TEST(u1.compare(u2) == 0); + u1.normalize(); + BOOST_TEST(u1.string() == after); + }; + + check("HtTp://cPpAlLiAnCe.oRG/", + "http://cppalliance.org/"); + check("http://%2a%2b%2C%2f%3A.org/", + "http://%2A%2B%2C%2F%3A.org/"); + check("http://%63%70%70%61%6c%6Ci%61n%63e.org/", + "http://cppalliance.org/"); + check("http://%43%70%50%61%6c%6Ci%61n%43e.org/", + "http://cppalliance.org/"); + check("http://cppalliance.org/a/b/c/./../../g", + "http://cppalliance.org/a/g"); + check("http://cppalliance.org/aa/bb/cc/./../../gg", + "http://cppalliance.org/aa/gg"); + check("http://cppalliance.org/a/b/../../g", + "http://cppalliance.org/g"); + check("http://cppalliance.org/a/b/../../../g", + "http://cppalliance.org/g"); + check("http://cppalliance.org/..", + "http://cppalliance.org/"); + } + + // remove_dot_segments + { + auto check = [](string_view 
p, + string_view e) { + url u1 = parse_relative_ref(p).value(); + u1.normalize_path(); + BOOST_TEST(u1.encoded_path() == e); + url u2 = parse_relative_ref(e).value(); + BOOST_TEST(u1.compare(u2) == 0); + }; + + + check("/a/b/c/./../../g", "/a/g"); + check("/aa/bb/cc/./../../gg", "/aa/gg"); + check("../a/b/c/./../../g", "../a/g"); + check("./a/b/c/./../../g", "a/g"); + check(".././a/b/c/./../../g", "../a/g"); + check("%2E%2E/./a/b/c/./../../g", "../a/g"); + check("/a/b/../../g", "/g"); + check("/a/b/../../../g", "/g"); + check("mid/content=5/../6", "mid/6"); + check("mid/content=5/../6/.", "mid/6/"); + check("mid/content=5/../6/..", "mid/"); + check("/..", "/"); + check(".", ""); + check("..", ".."); + check("", ""); + } + + // inequality + { + auto check = [](string_view e1, + string_view e2, + int cmp) { + url_view u1 = parse_uri(e1).value(); + url_view u2 = parse_uri(e2).value(); + BOOST_TEST(u1.compare(u2) == cmp); + BOOST_TEST(u2.compare(u1) == -cmp); + }; + + check("http://cppalliance.org", "https://cppalliance.org", -1); + check("https://cppalliance.org", "httpz://cppalliance.org", -1); + check("http://boost.org", "http://cppalliance.org", -1); + check("http://boost.orgg", "http://boost.org", +1); + check("http://cppalliance.org/%2E%2E/./b/b/c/./../../g", "http://cppalliance.org/../a/g", +1); + check("http://cppalliance.org?l=v", "http://cppalliance.org?k=v", 1); + check("http://cppalliance.org?%6C=v", "http://cppalliance.org?k=v", 1); + check("http://cppalliance.org#frag", "http://cppalliance.org#glob", -1); + check("http://cppalliance.org#fra", "http://cppalliance.org#frag", -1); + check("http://cppalliance.org#frag", "http://cppalliance.org#fra", 1); + } + + // path inequality + { + auto check = [](string_view e1, + string_view e2, + int cmp) { + url_view u1 = parse_relative_ref(e1).value(); + url_view u2 = parse_relative_ref(e2).value(); + BOOST_TEST(u1.compare(u2) == cmp); + BOOST_TEST(u2.compare(u1) == -cmp); + }; + + check("a/g", "/../g", 1); + 
check("./a/b/c/./../../g", "/a/b/../../../g", 1); + check("%2E/a/b/c/./../../g", "/a/b/../../../g", 1); + check("/../g", "a/g", -1); + check("/a/b/../../../g", "./a/b/c/./../../g", -1); + check("../g", "a/g", -1); + check("a/b/../../../g", "./a/b/c/./../../g", -1); + check("a/b/../../../%67", "./a/b/c/./../../g", -1); + check("/aa/g", "/aa/gg", -1); + check("../a/b", "..%2Fa/b", 1); + check("../a/b", "%2E%2E%2Fa/b", 1); + check("../a/b", "%2E%2E/a/b", 0); + } + } + + //-------------------------------------------- + void run() { @@ -1828,6 +1944,7 @@ class url_test testSegments(); testResolution(); testOstream(); + testNormalize(); } };