From b7c082e8a44f1e8fb0f89b17811ea039abaf5c6b Mon Sep 17 00:00:00 2001 From: Aapo Talvensaari Date: Tue, 26 Jul 2022 06:58:44 +0300 Subject: [PATCH] fix(tools.uri) normalization decodes as much as possible (#8140) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We decide to let `normalize` function to decode URL-encoded string as much as possible. **PLEASE REFERER TO**: https://github.com/Kong/kong/pull/8140#issuecomment-1162850953 ### Issues resolved Fix #7913, FTI-2904 Outdated discussion: ---------------------- This is alternative to PR #8139 where we actually fix the normalization function to not do excessive percent-decoding on normalization. When we added normalization kong.tools.uri.normalize, that function does percent-decoding on everything, except for the reserved characters. That means that we basically percent-decode more than just the ranges of ALPHA (%41–%5A and %61–%7A), DIGIT (%30–%39), hyphen (%2D), period (%2E), underscore (%5F), or tilde (%7E). (so called Unreserved Characters) Alternative Implementation: See #8139 --- kong/tools/uri.lua | 97 +++++++++++++++++------------- spec/01-unit/08-router_spec.lua | 40 ++++++++++++ spec/01-unit/18-tools_uri_spec.lua | 6 +- 3 files changed, 100 insertions(+), 43 deletions(-) diff --git a/kong/tools/uri.lua b/kong/tools/uri.lua index 2acdc0a351c..ecc599199ea 100644 --- a/kong/tools/uri.lua +++ b/kong/tools/uri.lua @@ -1,5 +1,4 @@ -local _M = {} - +local table_new = require "table.new" local string_char = string.char local string_upper = string.upper @@ -12,28 +11,32 @@ local table_concat = table.concat local ngx_re_find = ngx.re.find local ngx_re_gsub = ngx.re.gsub +-- Charset: +-- reserved = "!" / "*" / "'" / "(" / ")" / ";" / ":" / +-- "@" / "&" / "=" / "+" / "$" / "," / "/" / "?" / "%" / "#" / "[" / "]" +-- unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" +-- other: * (meaning any char that is not mentioned above) + +-- Reserved characters have special meaning in URI. Encoding/decoding it affects the semantics of the URI; +-- Unreserved characters are safe to use as part of HTTP message without encoding; +-- Other characters has not special meaning but may be not safe to use as part of HTTP message without encoding; + +-- We should not unescape or escape reserved characters; +-- We should unescape but not escape unreserved characters; +-- We choose to unescape when processing and escape when forwarding for other characters + +local RESERVED_CHARS = "!*'();:@&=+$,/?%#[]" + +local chars_to_decode = table_new(256, 0) +do + -- reserved + for i = 1, #RESERVED_CHARS do + chars_to_decode[string_byte(RESERVED_CHARS, i)] = true + end + + -- unreserved and others default to nil +end -local RESERVED_CHARACTERS = { - [0x21] = true, -- ! - [0x23] = true, -- # - [0x24] = true, -- $ - [0x25] = true, -- % - [0x26] = true, -- & - [0x27] = true, -- ' - [0x28] = true, -- ( - [0x29] = true, -- ) - [0x2A] = true, -- * - [0x2B] = true, -- + - [0x2C] = true, -- , - [0x2F] = true, -- / - [0x3A] = true, -- : - [0x3B] = true, -- ; - [0x3D] = true, -- = - [0x3F] = true, -- ? - [0x40] = true, -- @ - [0x5B] = true, -- [ - [0x5D] = true, -- ] -} local ESCAPE_PATTERN = "[^!#$&'()*+,/:;=?@[\\]A-Z\\d-_.~%]" @@ -41,23 +44,38 @@ local TMP_OUTPUT = require("table.new")(16, 0) local DOT = string_byte(".") local SLASH = string_byte("/") -local function percent_decode(m) - local hex = m[1] - local num = tonumber(hex, 16) - if RESERVED_CHARACTERS[num] then - return string_upper(m[0]) - end +local function normalize_decode(m) + local hex = m[1] + local num = tonumber(hex, 16) + + -- from rfc3986 we should decode unreserved character + -- and we choose to decode "others" + if not chars_to_decode[num] then -- is not reserved(false or nil) return string_char(num) + end + + return string_upper(m[0]) end -local function escape(m) +local function percent_escape(m) return string_format("%%%02X", string_byte(m[0])) end +-- This function does slightly different things from its name. +-- It ensures the output to be safe to a part of HTTP message (headers or path) +-- and preserve origin semantics +local function escape(uri) + if ngx_re_find(uri, ESCAPE_PATTERN, "joi") then + return (ngx_re_gsub(uri, ESCAPE_PATTERN, percent_escape, "joi")) + end + + return uri +end -function _M.normalize(uri, merge_slashes) + +local function normalize(uri, merge_slashes) -- check for simple cases and early exit if uri == "" or uri == "/" then return uri @@ -67,11 +85,12 @@ function _M.normalize(uri, merge_slashes) -- (this can in some cases lead to unnecessary percent-decoding) if string_find(uri, "%", 1, true) then -- decoding percent-encoded triplets of unreserved characters - uri = ngx_re_gsub(uri, "%([\\dA-F]{2})", percent_decode, "joi") + uri = ngx_re_gsub(uri, "%([\\dA-F]{2})", normalize_decode, "joi") end -- check if the uri contains a dot -- (this can in some cases lead to unnecessary dot removal processing) + -- notice: it's expected that /%2e./ is considered the same of /../ if string_find(uri, ".", 1, true) == nil then if not merge_slashes then return uri @@ -148,13 +167,7 @@ function _M.normalize(uri, merge_slashes) end -function _M.escape(uri) - if ngx_re_find(uri, ESCAPE_PATTERN, "joi") then - return ngx_re_gsub(uri, ESCAPE_PATTERN, escape, "joi") - end - - return uri -end - - -return _M +return { + escape = escape, + normalize = normalize, +} diff --git a/spec/01-unit/08-router_spec.lua b/spec/01-unit/08-router_spec.lua index 56adb4cf100..b27481fc983 100644 --- a/spec/01-unit/08-router_spec.lua +++ b/spec/01-unit/08-router_spec.lua @@ -2789,6 +2789,46 @@ for _, flavor in ipairs({ "traditional", "traditional_compatible" }) do assert.equal(2, #match_t.matches.uri_captures) end) + it("returns uri_captures normalized, fix #7913", function() + local use_case = { + { + service = service, + route = { + id = "e8fb37f1-102d-461e-9c51-6608a6bb8101", + paths = { [[~/users/(?P[a-zA-Z\s\d%]+)/profile/?(?P[a-z]*)]] }, + }, + }, + } + + + local router = assert(Router.new(use_case)) + local _ngx = mock_ngx("GET", "/users/%6aohn%20doe/profile", { host = "domain.org" }) + + router._set_ngx(_ngx) + local match_t = router:exec() + assert.equal("john doe", match_t.matches.uri_captures[1]) + assert.equal("john doe", match_t.matches.uri_captures.fullname) + assert.equal("", match_t.matches.uri_captures[2]) + assert.equal("", match_t.matches.uri_captures.scope) + -- returns the full match as well + assert.equal("/users/john doe/profile", match_t.matches.uri_captures[0]) + -- no stripped_uri capture + assert.is_nil(match_t.matches.uri_captures.stripped_uri) + assert.equal(2, #match_t.matches.uri_captures) + + -- again, this time from the LRU cache + local match_t = router:exec() + assert.equal("john doe", match_t.matches.uri_captures[1]) + assert.equal("john doe", match_t.matches.uri_captures.fullname) + assert.equal("", match_t.matches.uri_captures[2]) + assert.equal("", match_t.matches.uri_captures.scope) + -- returns the full match as well + assert.equal("/users/john doe/profile", match_t.matches.uri_captures[0]) + -- no stripped_uri capture + assert.is_nil(match_t.matches.uri_captures.stripped_uri) + assert.equal(2, #match_t.matches.uri_captures) + end) + it("returns no uri_captures from a [uri prefix] match", function() local use_case = { { diff --git a/spec/01-unit/18-tools_uri_spec.lua b/spec/01-unit/18-tools_uri_spec.lua index b9d9dc910aa..cb7cede521d 100644 --- a/spec/01-unit/18-tools_uri_spec.lua +++ b/spec/01-unit/18-tools_uri_spec.lua @@ -14,7 +14,7 @@ describe("kong.tools.uri", function() it("no normalization necessary (reserved characters)", function() assert.equal("/a%2Fb%2Fc/", uri.normalize("/a%2Fb%2Fc/")) - assert.equal("/%21%23%24%25%26%27%28%29%2A%2B%2C%2F%3A%3B%3D%3F%40%5B%5D", uri.normalize("/%21%23%24%25%26%27%28%29%2A%2B%2C%2F%3A%3B%3D%3F%40%5B%5D")) + assert.equal("/ %21%23%24%25%26%27%28%29%2A%2B%2C%2F%3A%3B%3D%3F%40%5B%5D", uri.normalize("/%20%21%23%24%25%26%27%28%29%2A%2B%2C%2F%3A%3B%3D%3F%40%5B%5D")) end) it("converting percent-encoded triplets to uppercase", function() @@ -50,6 +50,10 @@ describe("kong.tools.uri", function() it("decodes non-ASCII characters that are unreserved, issue #2366", function() assert.equal("/endeløst", uri.normalize("/endel%C3%B8st")) end) + + it("does normalize complex uri that has characters outside of normal uri charset", function() + assert.equal("/ä/a./a/_\x99\xaf%2F%2F" , uri.normalize("/%C3%A4/a/%2e./a%2E//a/%2e/./a/../a/%2e%2E/%5f%99%af%2f%2F", true)) + end) end) describe("escape()", function()