Skip to content

Commit

Permalink
feat(healthcheck) delayed_clear function
Browse files Browse the repository at this point in the history
Added new function delayed_clear. This function marks all targets to be
removed, but do not actually remove them. If before the delay parameter
any of them is re-added, it is unmarked for removal.

This function makes it possible to keep target state during config
changes, where the targets might be removed and then re-added.

Fixed report_* functions that were not using the same pattern for
hostnames. Now when hostname is empty, it's always replaced by the
IP.
  • Loading branch information
locao committed Feb 1, 2022
1 parent 34d6cad commit 9a78b3e
Show file tree
Hide file tree
Showing 9 changed files with 288 additions and 123 deletions.
87 changes: 70 additions & 17 deletions lib/resty/healthcheck.lua
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ local ngx_log = ngx.log
local tostring = tostring
local ipairs = ipairs
local cjson = require("cjson.safe").new()
local table_insert = table.insert
local table_remove = table.remove
local worker_events = require("resty.worker.events")
local resty_lock = require ("resty.lock")
Expand Down Expand Up @@ -281,20 +282,25 @@ end
function checker:add_target(ip, port, hostname, is_healthy, hostheader)
ip = tostring(assert(ip, "no ip address provided"))
port = assert(tonumber(port), "no port number provided")
hostname = hostname or ip
if is_healthy == nil then
is_healthy = true
end

local internal_health = is_healthy and "healthy" or "unhealthy"

local ok, err = locking_target_list(self, function(target_list)
local found = false

-- check whether we already have this target
for _, target in ipairs(target_list) do
if target.ip == ip and target.port == port and target.hostname == hostname then
self:log(DEBUG, "adding an existing target: ", hostname or "", " ", ip,
":", port, " (ignoring)")
return false
target.purge_time = nil
found = true
internal_health = self:get_target_status(ip, port, hostname) and
"healthy" or "unhealthy"
end
end

Expand All @@ -308,12 +314,14 @@ function checker:add_target(ip, port, hostname, is_healthy, hostheader)
end

-- target does not exist, go add it
target_list[#target_list + 1] = {
ip = ip,
port = port,
hostname = hostname,
hostheader = hostheader,
}
if not found then
target_list[#target_list + 1] = {
ip = ip,
port = port,
hostname = hostname,
hostheader = hostheader,
}
end
target_list = serialize(target_list)

ok, err = self.shm:set(self.TARGET_LIST, target_list)
Expand Down Expand Up @@ -433,6 +441,28 @@ function checker:clear()
end


function checker:delayed_clear(delay)
assert(tonumber(delay), "no delay provided")

return locking_target_list(self, function(target_list)
local purge_time = ngx_now() + delay

-- add purge time to all targets
for _, target in ipairs(target_list) do
target.purge_time = purge_time
end

target_list = serialize(target_list)
local ok, err = self.shm:set(self.TARGET_LIST, target_list)
if not ok then
return nil, "failed to store target_list in shm: " .. err
end

return true
end)
end


--- Get the current status of the target.
-- @param ip IP address of the target being checked.
-- @param port the port being checked against.
Expand Down Expand Up @@ -629,7 +659,7 @@ function checker:report_failure(ip, port, hostname, check)
ctr_type = CTR_HTTP
end

return incr_counter(self, "unhealthy", ip, port, hostname, limit, ctr_type)
return incr_counter(self, "unhealthy", ip, port, hostname or ip, limit, ctr_type)

end

Expand All @@ -648,7 +678,7 @@ function checker:report_success(ip, port, hostname, check)

local limit = self.checks[check or "passive"].healthy.successes

return incr_counter(self, "healthy", ip, port, hostname, limit, CTR_SUCCESS)
return incr_counter(self, "healthy", ip, port, hostname or ip, limit, CTR_SUCCESS)

end

Expand Down Expand Up @@ -686,7 +716,7 @@ function checker:report_http_status(ip, port, hostname, http_status, check)
return
end

return incr_counter(self, status_type, ip, port, hostname, limit, ctr)
return incr_counter(self, status_type, ip, port, hostname or ip, limit, ctr)

end

Expand All @@ -708,7 +738,7 @@ function checker:report_tcp_failure(ip, port, hostname, operation, check)
local limit = self.checks[check or "passive"].unhealthy.tcp_failures

-- TODO what do we do with the `operation` information
return incr_counter(self, "unhealthy", ip, port, hostname, limit, CTR_TCP)
return incr_counter(self, "unhealthy", ip, port, hostname or ip, limit, CTR_TCP)

end

Expand All @@ -725,7 +755,7 @@ function checker:report_timeout(ip, port, hostname, check)

local limit = self.checks[check or "passive"].unhealthy.timeouts

return incr_counter(self, "unhealthy", ip, port, hostname, limit, CTR_TIMEOUT)
return incr_counter(self, "unhealthy", ip, port, hostname or ip, limit, CTR_TIMEOUT)

end

Expand Down Expand Up @@ -766,6 +796,7 @@ end
function checker:set_target_status(ip, port, hostname, is_healthy)
ip = tostring(assert(ip, "no ip address provided"))
port = assert(tonumber(port), "no port number provided")
hostname = hostname or ip
assert(type(is_healthy) == "boolean")

local health_report = is_healthy and "healthy" or "unhealthy"
Expand Down Expand Up @@ -1085,7 +1116,7 @@ function checker:event_handler(event_name, ip, port, hostname)
then
if not target_found then
-- it is a new target, must add it first
target_found = { ip = ip, port = port, hostname = hostname }
target_found = { ip = ip, port = port, hostname = hostname or ip }
self.targets[target_found.ip] = self.targets[target_found.ip] or {}
self.targets[target_found.ip][target_found.port] = self.targets[target_found.ip][target_found.port] or {}
self.targets[target_found.ip][target_found.port][target_found.hostname or ip] = target_found
Expand Down Expand Up @@ -1126,7 +1157,7 @@ end

-- Raises an event for a target status change.
function checker:raise_event(event_name, ip, port, hostname)
local target = { ip = ip, port = port, hostname = hostname }
local target = { ip = ip, port = port, hostname = hostname or ip}
worker_events.post(self.EVENT_SOURCE, event_name, target)
end

Expand Down Expand Up @@ -1451,9 +1482,8 @@ function _M.new(opts)
return nil, err
end

-- if active checker is needed and not running, start it
if (self.checks.active.healthy.active or self.checks.active.unhealthy.active)
and active_check_timer == nil then
-- if active checker is not running, start it
if active_check_timer == nil then

self:log(DEBUG, "worker ", ngx_worker_id(), " (pid: ", ngx_worker_pid(), ") ",
"starting active check timer")
Expand All @@ -1475,6 +1505,29 @@ function _M.new(opts)

local cur_time = ngx_now()
for _, checker_obj in ipairs(hcs) do
-- clear targets marked for delayed removal
locking_target_list(checker_obj, function(target_list)
local removed_targets = {}
for i, target in ipairs(target_list) do
if target.purge_time and target.purge_time <= cur_time then
table_insert(removed_targets, target)
table_remove(target_list, i)
end
end

target_list = serialize(target_list)

local ok, err = shm:set(checker_obj.TARGET_LIST, target_list)
if not ok then
return nil, "failed to store target_list in shm: " .. err
end

for _, target in ipairs(removed_targets) do
clear_target_data_from_shm(checker_obj, target.ip, target.port, target.hostname)
checker_obj:raise_event(checker_obj.events.remove, target.ip, target.port, target.hostname)
end
end)

if checker_obj.checks.active.healthy.active and
(checker_obj.checks.active.healthy.last_run +
checker_obj.checks.active.healthy.interval <= cur_time)
Expand Down
34 changes: 17 additions & 17 deletions t/04-report_success.t
Original file line number Diff line number Diff line change
Expand Up @@ -84,14 +84,14 @@ GET /t
true
true
--- error_log
healthy SUCCESS increment (1/3) for '(127.0.0.1:2116)'
healthy SUCCESS increment (2/3) for '(127.0.0.1:2116)'
healthy SUCCESS increment (3/3) for '(127.0.0.1:2116)'
event: target status '(127.0.0.1:2116)' from 'false' to 'true'
healthy SUCCESS increment (1/3) for '(127.0.0.1:2116)'
healthy SUCCESS increment (2/3) for '(127.0.0.1:2116)'
healthy SUCCESS increment (3/3) for '(127.0.0.1:2116)'
event: target status '(127.0.0.1:2118)' from 'false' to 'true'
healthy SUCCESS increment (1/3) for '127.0.0.1(127.0.0.1:2116)'
healthy SUCCESS increment (2/3) for '127.0.0.1(127.0.0.1:2116)'
healthy SUCCESS increment (3/3) for '127.0.0.1(127.0.0.1:2116)'
event: target status '127.0.0.1(127.0.0.1:2116)' from 'false' to 'true'
healthy SUCCESS increment (1/3) for '127.0.0.1(127.0.0.1:2116)'
healthy SUCCESS increment (2/3) for '127.0.0.1(127.0.0.1:2116)'
healthy SUCCESS increment (3/3) for '127.0.0.1(127.0.0.1:2116)'
event: target status '127.0.0.1(127.0.0.1:2118)' from 'false' to 'true'


=== TEST 2: report_success() recovers TCP active = passive
Expand Down Expand Up @@ -159,14 +159,14 @@ GET /t
true
true
--- error_log
healthy SUCCESS increment (1/3) for '(127.0.0.1:2116)'
healthy SUCCESS increment (2/3) for '(127.0.0.1:2116)'
healthy SUCCESS increment (3/3) for '(127.0.0.1:2116)'
event: target status '(127.0.0.1:2116)' from 'false' to 'true'
healthy SUCCESS increment (1/3) for '(127.0.0.1:2116)'
healthy SUCCESS increment (2/3) for '(127.0.0.1:2116)'
healthy SUCCESS increment (3/3) for '(127.0.0.1:2116)'
event: target status '(127.0.0.1:2118)' from 'false' to 'true'
healthy SUCCESS increment (1/3) for '127.0.0.1(127.0.0.1:2116)'
healthy SUCCESS increment (2/3) for '127.0.0.1(127.0.0.1:2116)'
healthy SUCCESS increment (3/3) for '127.0.0.1(127.0.0.1:2116)'
event: target status '127.0.0.1(127.0.0.1:2116)' from 'false' to 'true'
healthy SUCCESS increment (1/3) for '127.0.0.1(127.0.0.1:2116)'
healthy SUCCESS increment (2/3) for '127.0.0.1(127.0.0.1:2116)'
healthy SUCCESS increment (3/3) for '127.0.0.1(127.0.0.1:2116)'
event: target status '127.0.0.1(127.0.0.1:2118)' from 'false' to 'true'

=== TEST 3: report_success() is a nop when active.healthy.sucesses == 0
--- http_config eval
Expand Down Expand Up @@ -292,4 +292,4 @@ GET /t
false
--- no_error_log
healthy SUCCESS increment
event: target status '(127.0.0.1:2118)' from 'false' to 'true'
event: target status '127.0.0.1(127.0.0.1:2118)' from 'false' to 'true'
40 changes: 20 additions & 20 deletions t/05-report_failure.t
Original file line number Diff line number Diff line change
Expand Up @@ -84,14 +84,14 @@ GET /t
false
false
--- error_log
unhealthy HTTP increment (1/3) for '(127.0.0.1:2117)'
unhealthy HTTP increment (2/3) for '(127.0.0.1:2117)'
unhealthy HTTP increment (3/3) for '(127.0.0.1:2117)'
event: target status '(127.0.0.1:2117)' from 'true' to 'false'
unhealthy HTTP increment (1/3) for '(127.0.0.1:2113)'
unhealthy HTTP increment (2/3) for '(127.0.0.1:2113)'
unhealthy HTTP increment (3/3) for '(127.0.0.1:2113)'
event: target status '(127.0.0.1:2113)' from 'true' to 'false'
unhealthy HTTP increment (1/3) for '127.0.0.1(127.0.0.1:2117)'
unhealthy HTTP increment (2/3) for '127.0.0.1(127.0.0.1:2117)'
unhealthy HTTP increment (3/3) for '127.0.0.1(127.0.0.1:2117)'
event: target status '127.0.0.1(127.0.0.1:2117)' from 'true' to 'false'
unhealthy HTTP increment (1/3) for '127.0.0.1(127.0.0.1:2113)'
unhealthy HTTP increment (2/3) for '127.0.0.1(127.0.0.1:2113)'
unhealthy HTTP increment (3/3) for '127.0.0.1(127.0.0.1:2113)'
event: target status '127.0.0.1(127.0.0.1:2113)' from 'true' to 'false'


=== TEST 2: report_failure() fails TCP active + passive
Expand Down Expand Up @@ -159,12 +159,12 @@ GET /t
false
false
--- error_log
unhealthy TCP increment (1/2) for '(127.0.0.1:2117)'
unhealthy TCP increment (2/2) for '(127.0.0.1:2117)'
event: target status '(127.0.0.1:2117)' from 'true' to 'false'
unhealthy TCP increment (1/2) for '(127.0.0.1:2113)'
unhealthy TCP increment (2/2) for '(127.0.0.1:2113)'
event: target status '(127.0.0.1:2113)' from 'true' to 'false'
unhealthy TCP increment (1/2) for '127.0.0.1(127.0.0.1:2117)'
unhealthy TCP increment (2/2) for '127.0.0.1(127.0.0.1:2117)'
event: target status '127.0.0.1(127.0.0.1:2117)' from 'true' to 'false'
unhealthy TCP increment (1/2) for '127.0.0.1(127.0.0.1:2113)'
unhealthy TCP increment (2/2) for '127.0.0.1(127.0.0.1:2113)'
event: target status '127.0.0.1(127.0.0.1:2113)' from 'true' to 'false'


=== TEST 3: report_failure() is a nop when failure counters == 0
Expand Down Expand Up @@ -232,9 +232,9 @@ GET /t
true
true
--- no_error_log
unhealthy TCP increment (1/2) for '(127.0.0.1:2117)'
unhealthy TCP increment (2/2) for '(127.0.0.1:2117)'
event: target status '(127.0.0.1:2117)' from 'true' to 'false'
unhealthy TCP increment (1/2) for '(127.0.0.1:2113)'
unhealthy TCP increment (2/2) for '(127.0.0.1:2113)'
event: target status '(127.0.0.1:2113)' from 'true' to 'false'
unhealthy TCP increment (1/2) for '127.0.0.1(127.0.0.1:2117)'
unhealthy TCP increment (2/2) for '127.0.0.1(127.0.0.1:2117)'
event: target status '127.0.0.1(127.0.0.1:2117)' from 'true' to 'false'
unhealthy TCP increment (1/2) for '127.0.0.1(127.0.0.1:2113)'
unhealthy TCP increment (2/2) for '127.0.0.1(127.0.0.1:2113)'
event: target status '127.0.0.1(127.0.0.1:2113)' from 'true' to 'false'
38 changes: 19 additions & 19 deletions t/06-report_http_status.t
Original file line number Diff line number Diff line change
Expand Up @@ -84,14 +84,14 @@ GET /t
false
false
--- error_log
unhealthy HTTP increment (1/3) for '(127.0.0.1:2119)'
unhealthy HTTP increment (2/3) for '(127.0.0.1:2119)'
unhealthy HTTP increment (3/3) for '(127.0.0.1:2119)'
event: target status '(127.0.0.1:2119)' from 'true' to 'false'
unhealthy HTTP increment (1/3) for '(127.0.0.1:2113)'
unhealthy HTTP increment (2/3) for '(127.0.0.1:2113)'
unhealthy HTTP increment (3/3) for '(127.0.0.1:2113)'
event: target status '(127.0.0.1:2113)' from 'true' to 'false'
unhealthy HTTP increment (1/3) for '127.0.0.1(127.0.0.1:2119)'
unhealthy HTTP increment (2/3) for '127.0.0.1(127.0.0.1:2119)'
unhealthy HTTP increment (3/3) for '127.0.0.1(127.0.0.1:2119)'
event: target status '127.0.0.1(127.0.0.1:2119)' from 'true' to 'false'
unhealthy HTTP increment (1/3) for '127.0.0.1(127.0.0.1:2113)'
unhealthy HTTP increment (2/3) for '127.0.0.1(127.0.0.1:2113)'
unhealthy HTTP increment (3/3) for '127.0.0.1(127.0.0.1:2113)'
event: target status '127.0.0.1(127.0.0.1:2113)' from 'true' to 'false'



Expand Down Expand Up @@ -162,16 +162,16 @@ GET /t
true
true
--- error_log
healthy SUCCESS increment (1/4) for '(127.0.0.1:2119)'
healthy SUCCESS increment (2/4) for '(127.0.0.1:2119)'
healthy SUCCESS increment (3/4) for '(127.0.0.1:2119)'
healthy SUCCESS increment (4/4) for '(127.0.0.1:2119)'
event: target status '(127.0.0.1:2119)' from 'false' to 'true'
healthy SUCCESS increment (1/4) for '(127.0.0.1:2113)'
healthy SUCCESS increment (2/4) for '(127.0.0.1:2113)'
healthy SUCCESS increment (3/4) for '(127.0.0.1:2113)'
healthy SUCCESS increment (4/4) for '(127.0.0.1:2113)'
event: target status '(127.0.0.1:2113)' from 'false' to 'true'
healthy SUCCESS increment (1/4) for '127.0.0.1(127.0.0.1:2119)'
healthy SUCCESS increment (2/4) for '127.0.0.1(127.0.0.1:2119)'
healthy SUCCESS increment (3/4) for '127.0.0.1(127.0.0.1:2119)'
healthy SUCCESS increment (4/4) for '127.0.0.1(127.0.0.1:2119)'
event: target status '127.0.0.1(127.0.0.1:2119)' from 'false' to 'true'
healthy SUCCESS increment (1/4) for '127.0.0.1(127.0.0.1:2113)'
healthy SUCCESS increment (2/4) for '127.0.0.1(127.0.0.1:2113)'
healthy SUCCESS increment (3/4) for '127.0.0.1(127.0.0.1:2113)'
healthy SUCCESS increment (4/4) for '127.0.0.1(127.0.0.1:2113)'
event: target status '127.0.0.1(127.0.0.1:2113)' from 'false' to 'true'


=== TEST 3: report_http_status() with success is a nop when passive.healthy.successes == 0
Expand Down Expand Up @@ -427,7 +427,7 @@ GET /t
true
--- no_error_log
unhealthy HTTP increment
event: target status '(127.0.0.1:2119)' from 'true' to 'false'
event: target status '127.0.0.1(127.0.0.1:2119)' from 'true' to 'false'


=== TEST 5: report_http_status() must work in log phase
Expand Down
Loading

0 comments on commit 9a78b3e

Please sign in to comment.