Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(balancer) async initialization of balancer objects #3187

Merged
merged 2 commits into from
Feb 14, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 55 additions & 7 deletions kong/core/balancer.lua
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,11 @@ local table_concat = table.concat
local crc32 = ngx.crc32_short
local toip = dns_client.toip
local log = ngx.log
local sleep = ngx.sleep
local min = math.min
local max = math.max

local CRIT = ngx.CRIT
local ERR = ngx.ERR
local WARN = ngx.WARN
local DEBUG = ngx.DEBUG
Expand Down Expand Up @@ -289,9 +293,49 @@ do
end
end

local creating = {}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should we scope this and other create_balancer dependencies under a do ... end block? It'd be a little bit saner/easier to skim through imho

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is already inside a do-block and far down so it's only visible to the two relevant functions that need it. Can we leave this one as-is?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm I missed it; quite large for do block, but that's fine!


-- Block until the balancer for upstream `id` is no longer flagged in the
-- `creating` table (i.e. another coroutine finished building it), or give
-- up after a fixed time budget.
-- Polling uses exponential back-off: the sleep interval doubles each round,
-- capped both by `interval_cap` and by the time still remaining.
-- @param id (string) upstream id whose in-progress balancer we wait on
-- @return true on success, or nil + "timeout" when the budget is exhausted
local function wait(id)
  local remaining    = 30    -- total time budget (seconds)
  local interval     = 0.001 -- first poll interval
  local growth       = 2     -- back-off multiplier
  local interval_cap = 0.5   -- longest single sleep allowed
  repeat
    sleep(interval)
    remaining = remaining - interval
    if not creating[id] then
      return true
    end
    if remaining <= 0 then
      return nil, "timeout"
    end
    interval = min(max(0.001, interval * growth), remaining, interval_cap)
  until remaining <= 0
  return nil, "timeout"
end

------------------------------------------------------------------------------
-- @param upstream (table) The upstream data
-- @param recreate (boolean, optional) create new balancer even if one exists
-- @param history (table, optional) history of target updates
-- @param start (integer, optional) from where to start reading the history
-- @return The new balancer object, or nil+error
create_balancer = function(upstream, history, start)
create_balancer = function(upstream, recreate, history, start)

if balancers[upstream.id] and not recreate then
return balancers[upstream.id]
end

if creating[upstream.id] then
local ok = wait(upstream.id)
if not ok then
return nil, "timeout waiting for balancer for " .. upstream.id
end
return balancers[upstream.id]
end

creating[upstream.id] = true

local balancer, err = ring_balancer.new({
wheelSize = upstream.slots,
order = upstream.orderlist,
Expand Down Expand Up @@ -319,6 +363,8 @@ do
-- is fully set up.
balancers[upstream.id] = balancer

creating[upstream.id] = nil

return balancer
end
end
Expand Down Expand Up @@ -374,7 +420,7 @@ local function check_target_history(upstream, balancer)

stop_healthchecker(balancer)

local new_balancer, err = create_balancer(upstream, new_history, 1)
local new_balancer, err = create_balancer(upstream, true, new_history, 1)
if not new_balancer then
return nil, err
end
Expand Down Expand Up @@ -470,7 +516,7 @@ local function get_balancer(target, no_create)
return nil, "balancer not found"
else
log(ERR, "balancer not found for ", upstream.name, ", will create it")
return create_balancer(upstream)
return create_balancer(upstream), upstream
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

dammit, seems like this already was an issue?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it wasn't -- I mean, it was with regard to the function signature but it didn't happen because the get_balancer -> create_balancer code path was never hit when all balancers were created at startup. Now that it is async and get_balancer can run before init finishes running, then this code path can happen.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My one concern here is that this can be confusing for users of get_balancer and increase cognitive load: create_balancer can return 1 or 2 values, making get_balancer return 1, 2, or 3 values. Usually, we deal with this by catching errors like:

local balancer, err = create_balancer(upstream)
if not balancer then
  return nil, err
end

return balancer, upstream -- still somewhat annoying to me, I prefer `balancer, nil, upstream` but up to you!

end
end

Expand Down Expand Up @@ -523,7 +569,7 @@ local function on_upstream_event(operation, upstream)

local _, err = create_balancer(upstream)
if err then
log(ERR, "failed creating balancer for ", upstream.name, ": ", err)
log(CRIT, "failed creating balancer for ", upstream.name, ": ", err)
end

elseif operation == "delete" or operation == "update" then
Expand All @@ -540,7 +586,7 @@ local function on_upstream_event(operation, upstream)
if operation == "delete" then
balancers[upstream.id] = nil
else
local _, err = create_balancer(upstream)
local _, err = create_balancer(upstream, true)
if err then
log(ERR, "failed recreating balancer for ", upstream.name, ": ", err)
end
Expand Down Expand Up @@ -603,9 +649,10 @@ end


local function init()

local upstreams, err = get_all_upstreams()
if not upstreams then
log(ngx.STDERR, "failed loading initial list of upstreams: ", err)
log(CRIT, "failed loading initial list of upstreams: " .. err)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Minor, but better not make use of Lua-land string concatenation in those error messages, and rely on the variadic arguments of ngx.log instead

return
end

Expand All @@ -616,11 +663,12 @@ local function init()
if ok ~= nil then
oks = oks + 1
else
log(ngx.STDERR, "failed creating balancer for ", name, ": ", err)
log(CRIT, "failed creating balancer for ", name, ": ", err)
errs = errs + 1
end
end
log(DEBUG, "initialized ", oks, " balancer(s), ", errs, " error(s)")

end


Expand Down
13 changes: 7 additions & 6 deletions kong/core/handler.lua
Original file line number Diff line number Diff line change
Expand Up @@ -85,12 +85,6 @@ return {
local cluster_events = singletons.cluster_events


-- initialize balancers


balancer.init()


-- events dispatcher


Expand Down Expand Up @@ -303,6 +297,12 @@ return {
end
end)


-- initialize balancers for active healthchecks
ngx.timer.at(0, function()
balancer.init()
end)

end
},
certificate = {
Expand All @@ -320,6 +320,7 @@ return {
},
access = {
before = function(ctx)

-- ensure router is up-to-date

local version, err = singletons.cache:get("router:version", {
Expand Down
29 changes: 29 additions & 0 deletions spec/01-unit/011-balancer_spec.lua
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,35 @@ describe("Balancer", function()
assert.truthy(hc)
hc:stop()
end)

it("reuses a balancer by default", function()
  -- without the `recreate` flag, asking twice for the same upstream's
  -- balancer must hand back the very same object (cached, not rebuilt)
  local first = balancer._create_balancer(UPSTREAMS_FIXTURES[1])
  assert.truthy(first)
  local checker = balancer._get_healthchecker(first)
  local second = balancer._create_balancer(UPSTREAMS_FIXTURES[1])
  assert.equal(first, second)
  -- stop the healthchecker so it does not leak into other tests
  assert(checker:stop())
end)

it("re-creates a balancer if told to", function()
  -- passing `recreate = true` must build a brand-new balancer object,
  -- replaying the same target history into it
  local b1 = balancer._create_balancer(UPSTREAMS_FIXTURES[1], true)
  assert.truthy(b1)
  local hc1 = balancer._get_healthchecker(b1)
  assert(hc1:stop())
  local b2 = balancer._create_balancer(UPSTREAMS_FIXTURES[1], true)
  -- fixed: was `assert.truthy(b1)` — a copy-paste slip that re-checked the
  -- first balancer and left the second one unverified
  assert.truthy(b2)
  local hc2 = balancer._get_healthchecker(b2)
  assert(hc2:stop())
  -- expected history of target updates, as applied to both balancers
  local target_history = {
    { name = "mashape.com", port = 80, order = "001:a3", weight = 10 },
    { name = "mashape.com", port = 80, order = "002:a2", weight = 10 },
    { name = "mashape.com", port = 80, order = "002:a4", weight = 10 },
    { name = "mashape.com", port = 80, order = "003:a1", weight = 10 },
  }
  -- distinct objects (deep-compared via not_same on different instances)...
  assert.not_same(b1, b2)
  -- ...but both carry the identical target history
  assert.same(target_history, balancer._get_target_history(b1))
  assert.same(target_history, balancer._get_target_history(b2))
end)
end)

describe("get_balancer()", function()
Expand Down
77 changes: 75 additions & 2 deletions spec/02-integration/05-proxy/09-balancer_spec.lua
Original file line number Diff line number Diff line change
Expand Up @@ -274,12 +274,14 @@ local function client_requests(n, headers)
["Host"] = "balancer.test"
}
}
if res.status == 200 then
if not res then
fails = fails + 1
elseif res.status == 200 then
oks = oks + 1
elseif res.status > 399 then
fails = fails + 1
end
last_status = res.status
last_status = res and res.status
client:close()
end
return oks, fails, last_status
Expand Down Expand Up @@ -830,6 +832,77 @@ dao_helpers.for_each_dao(function(kong_config)
end
end)

-- Regression test for async balancer init: active healthchecks must mark a
-- target unhealthy *before* any proxy traffic has touched the balancer,
-- which only works if balancers are created at startup (async, via timer).
it("perform active health checks -- can detect before any proxy traffic", function()

  local healthcheck_interval = 0.2

  -- number of failed probes before a target is declared unhealthy
  local nfails = 2

  -- configure healthchecks
  local api_client = helpers.admin_client()
  assert(api_client:send {
    method = "PATCH",
    path = "/upstreams/" .. upstream.name,
    headers = {
      ["Content-Type"] = "application/json",
    },
    body = {
      healthchecks = healthchecks_config {
        active = {
          http_path = "/status",
          healthy = {
            interval = healthcheck_interval,
            successes = 1,
          },
          unhealthy = {
            interval = healthcheck_interval,
            http_failures = nfails,
            tcp_failures = nfails,
          },
        }
      }
    },
  })
  api_client:close()

  -- per-request timeout for the mock servers (seconds)
  local timeout = 2.5
  local requests = upstream.slots * 2 -- go round the balancer twice

  -- setup target servers:
  -- server1 will respond all requests, server2 will timeout
  local server1 = http_server(timeout, localhost, PORT, { requests })
  local server2 = http_server(timeout, localhost, PORT + 1, { requests })

  -- server2 goes unhealthy before the first request
  direct_request(localhost, PORT + 1, "/unhealthy")

  -- restart Kong; preserve_prefix and preserve_tables are both true so the
  -- upstream/target configuration survives the restart
  helpers.stop_kong(nil, true, true)
  helpers.start_kong()

  -- Give time for healthchecker to detect
  -- (0.5s startup slack plus enough probe intervals for nfails failures)
  ngx.sleep(0.5 + (2 + nfails) * healthcheck_interval)

  -- Phase 1: server1 takes all requests
  local client_oks, client_fails = client_requests(requests)

  helpers.stop_kong(nil, true, true)

  -- collect server results; hitcount
  local _, ok1, fail1 = server1:join()
  local _, ok2, fail2 = server2:join()

  -- verify: every request landed on server1; server2 (already marked
  -- unhealthy by the active checker) received no proxied traffic at all
  assert.are.equal(requests, ok1)
  assert.are.equal(0, ok2)
  assert.are.equal(0, fail1)
  assert.are.equal(0, fail2)

  -- and every client request succeeded
  assert.are.equal(requests, client_oks)
  assert.are.equal(0, client_fails)

end)

it("perform passive health checks -- manual recovery", function()

for nfails = 1, 5 do
Expand Down
6 changes: 4 additions & 2 deletions spec/helpers.lua
Original file line number Diff line number Diff line change
Expand Up @@ -1007,11 +1007,13 @@ return {

return kong_exec("start --conf " .. TEST_CONF_PATH .. nginx_conf, env)
end,
stop_kong = function(prefix, preserve_prefix)
stop_kong = function(prefix, preserve_prefix, preserve_tables)
prefix = prefix or conf.prefix
local ok, err = kong_exec("stop --prefix " .. prefix)
wait_pid(conf.nginx_pid, nil)
dao:truncate_tables()
if not preserve_tables then
dao:truncate_tables()
end
if not preserve_prefix then
clean_prefix(prefix)
end
Expand Down