Skip to content

Commit

Permalink
fix(tools.uri) normalization decodes as much as possible (#8140)
Browse files Browse the repository at this point in the history
We decide to let `normalize` function to decode URL-encoded string as much as possible.

**PLEASE REFERER TO**: #8140 (comment)

### Issues resolved

Fix #7913, FTI-2904

Outdated discussion:

----------------------

This is alternative to PR #8139 where we actually fix the normalization function to not do excessive percent-decoding on normalization.

When we added normalization kong.tools.uri.normalize, that function does percent-decoding on everything, except for the reserved characters.

That means that we basically percent-decode more than just the ranges of ALPHA (%41–%5A and %61–%7A), DIGIT (%30–%39), hyphen (%2D), period (%2E), underscore (%5F), or tilde (%7E). (so called Unreserved Characters)

Alternative Implementation: See #8139
  • Loading branch information
bungle authored Jul 26, 2022
1 parent 8b99ad7 commit b7c082e
Show file tree
Hide file tree
Showing 3 changed files with 100 additions and 43 deletions.
97 changes: 55 additions & 42 deletions kong/tools/uri.lua
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
local _M = {}

local table_new = require "table.new"

local string_char = string.char
local string_upper = string.upper
Expand All @@ -12,52 +11,71 @@ local table_concat = table.concat
local ngx_re_find = ngx.re.find
local ngx_re_gsub = ngx.re.gsub

-- Charset:
-- reserved = "!" / "*" / "'" / "(" / ")" / ";" / ":" /
-- "@" / "&" / "=" / "+" / "$" / "," / "/" / "?" / "%" / "#" / "[" / "]"
-- unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
-- other: * (meaning any char that is not mentioned above)

-- Reserved characters have special meaning in URI. Encoding/decoding it affects the semantics of the URI;
-- Unreserved characters are safe to use as part of HTTP message without encoding;
-- Other characters has not special meaning but may be not safe to use as part of HTTP message without encoding;

-- We should not unescape or escape reserved characters;
-- We should unescape but not escape unreserved characters;
-- We choose to unescape when processing and escape when forwarding for other characters

local RESERVED_CHARS = "!*'();:@&=+$,/?%#[]"

local chars_to_decode = table_new(256, 0)
do
-- reserved
for i = 1, #RESERVED_CHARS do
chars_to_decode[string_byte(RESERVED_CHARS, i)] = true
end

-- unreserved and others default to nil
end

local RESERVED_CHARACTERS = {
[0x21] = true, -- !
[0x23] = true, -- #
[0x24] = true, -- $
[0x25] = true, -- %
[0x26] = true, -- &
[0x27] = true, -- '
[0x28] = true, -- (
[0x29] = true, -- )
[0x2A] = true, -- *
[0x2B] = true, -- +
[0x2C] = true, -- ,
[0x2F] = true, -- /
[0x3A] = true, -- :
[0x3B] = true, -- ;
[0x3D] = true, -- =
[0x3F] = true, -- ?
[0x40] = true, -- @
[0x5B] = true, -- [
[0x5D] = true, -- ]
}

local ESCAPE_PATTERN = "[^!#$&'()*+,/:;=?@[\\]A-Z\\d-_.~%]"

local TMP_OUTPUT = require("table.new")(16, 0)
local DOT = string_byte(".")
local SLASH = string_byte("/")

local function percent_decode(m)
local hex = m[1]
local num = tonumber(hex, 16)
if RESERVED_CHARACTERS[num] then
return string_upper(m[0])
end

local function normalize_decode(m)
local hex = m[1]
local num = tonumber(hex, 16)

-- from rfc3986 we should decode unreserved character
-- and we choose to decode "others"
if not chars_to_decode[num] then -- is not reserved(false or nil)
return string_char(num)
end

return string_upper(m[0])
end


local function escape(m)
local function percent_escape(m)
return string_format("%%%02X", string_byte(m[0]))
end

-- This function does slightly different things from its name.
-- It ensures the output to be safe to a part of HTTP message (headers or path)
-- and preserve origin semantics
local function escape(uri)
if ngx_re_find(uri, ESCAPE_PATTERN, "joi") then
return (ngx_re_gsub(uri, ESCAPE_PATTERN, percent_escape, "joi"))
end

return uri
end

function _M.normalize(uri, merge_slashes)

local function normalize(uri, merge_slashes)
-- check for simple cases and early exit
if uri == "" or uri == "/" then
return uri
Expand All @@ -67,11 +85,12 @@ function _M.normalize(uri, merge_slashes)
-- (this can in some cases lead to unnecessary percent-decoding)
if string_find(uri, "%", 1, true) then
-- decoding percent-encoded triplets of unreserved characters
uri = ngx_re_gsub(uri, "%([\\dA-F]{2})", percent_decode, "joi")
uri = ngx_re_gsub(uri, "%([\\dA-F]{2})", normalize_decode, "joi")
end

-- check if the uri contains a dot
-- (this can in some cases lead to unnecessary dot removal processing)
-- notice: it's expected that /%2e./ is considered the same of /../
if string_find(uri, ".", 1, true) == nil then
if not merge_slashes then
return uri
Expand Down Expand Up @@ -148,13 +167,7 @@ function _M.normalize(uri, merge_slashes)
end


function _M.escape(uri)
if ngx_re_find(uri, ESCAPE_PATTERN, "joi") then
return ngx_re_gsub(uri, ESCAPE_PATTERN, escape, "joi")
end

return uri
end


return _M
return {
escape = escape,
normalize = normalize,
}
40 changes: 40 additions & 0 deletions spec/01-unit/08-router_spec.lua
Original file line number Diff line number Diff line change
Expand Up @@ -2789,6 +2789,46 @@ for _, flavor in ipairs({ "traditional", "traditional_compatible" }) do
assert.equal(2, #match_t.matches.uri_captures)
end)

it("returns uri_captures normalized, fix #7913", function()
local use_case = {
{
service = service,
route = {
id = "e8fb37f1-102d-461e-9c51-6608a6bb8101",
paths = { [[~/users/(?P<fullname>[a-zA-Z\s\d%]+)/profile/?(?P<scope>[a-z]*)]] },
},
},
}


local router = assert(Router.new(use_case))
local _ngx = mock_ngx("GET", "/users/%6aohn%20doe/profile", { host = "domain.org" })

router._set_ngx(_ngx)
local match_t = router:exec()
assert.equal("john doe", match_t.matches.uri_captures[1])
assert.equal("john doe", match_t.matches.uri_captures.fullname)
assert.equal("", match_t.matches.uri_captures[2])
assert.equal("", match_t.matches.uri_captures.scope)
-- returns the full match as well
assert.equal("/users/john doe/profile", match_t.matches.uri_captures[0])
-- no stripped_uri capture
assert.is_nil(match_t.matches.uri_captures.stripped_uri)
assert.equal(2, #match_t.matches.uri_captures)

-- again, this time from the LRU cache
local match_t = router:exec()
assert.equal("john doe", match_t.matches.uri_captures[1])
assert.equal("john doe", match_t.matches.uri_captures.fullname)
assert.equal("", match_t.matches.uri_captures[2])
assert.equal("", match_t.matches.uri_captures.scope)
-- returns the full match as well
assert.equal("/users/john doe/profile", match_t.matches.uri_captures[0])
-- no stripped_uri capture
assert.is_nil(match_t.matches.uri_captures.stripped_uri)
assert.equal(2, #match_t.matches.uri_captures)
end)

it("returns no uri_captures from a [uri prefix] match", function()
local use_case = {
{
Expand Down
6 changes: 5 additions & 1 deletion spec/01-unit/18-tools_uri_spec.lua
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ describe("kong.tools.uri", function()

it("no normalization necessary (reserved characters)", function()
assert.equal("/a%2Fb%2Fc/", uri.normalize("/a%2Fb%2Fc/"))
assert.equal("/%21%23%24%25%26%27%28%29%2A%2B%2C%2F%3A%3B%3D%3F%40%5B%5D", uri.normalize("/%21%23%24%25%26%27%28%29%2A%2B%2C%2F%3A%3B%3D%3F%40%5B%5D"))
assert.equal("/ %21%23%24%25%26%27%28%29%2A%2B%2C%2F%3A%3B%3D%3F%40%5B%5D", uri.normalize("/%20%21%23%24%25%26%27%28%29%2A%2B%2C%2F%3A%3B%3D%3F%40%5B%5D"))
end)

it("converting percent-encoded triplets to uppercase", function()
Expand Down Expand Up @@ -50,6 +50,10 @@ describe("kong.tools.uri", function()
it("decodes non-ASCII characters that are unreserved, issue #2366", function()
assert.equal("/endeløst", uri.normalize("/endel%C3%B8st"))
end)

it("does normalize complex uri that has characters outside of normal uri charset", function()
assert.equal("/ä/a./a/_\x99\xaf%2F%2F" , uri.normalize("/%C3%A4/a/%2e./a%2E//a/%2e/./a/../a/%2e%2E/%5f%99%af%2f%2F", true))
end)
end)

describe("escape()", function()
Expand Down

0 comments on commit b7c082e

Please sign in to comment.