From e3a0e0dd0f8547a363d29cabd10afa422386bb21 Mon Sep 17 00:00:00 2001 From: Vinicius Mignot Date: Mon, 11 Nov 2019 13:36:36 -0300 Subject: [PATCH] feat(balancer) use lua-resty-dns-client healthThreshold attrib (#5206) added unit test healthchecks.threshold param validation --- autodoc/data/admin-api.lua | 45 +++++-- kong/api/routes/upstreams.lua | 20 ++- kong/db/dao/targets.lua | 10 ++ kong/db/schema/entities/upstreams.lua | 8 ++ kong/runloop/balancer.lua | 58 ++++++++- .../01-db/01-schema/09-upstreams_spec.lua | 17 +++ .../11-declarative_config/03-flatten_spec.lua | 6 +- .../10-balancer/01-ring-balancer_spec.lua | 120 +++++++++++++++++- 8 files changed, 262 insertions(+), 22 deletions(-) diff --git a/autodoc/data/admin-api.lua b/autodoc/data/admin-api.lua index 1c0454724174..e85614a9c86c 100644 --- a/autodoc/data/admin-api.lua +++ b/autodoc/data/admin-api.lua @@ -1052,15 +1052,15 @@ return { GET = { title = [[Show Upstream health for node]], description = [[ - Displays the health status for all Targets of a given Upstream, according to - the perspective of a specific Kong node. Note that, being node-specific - information, making this same request to different nodes of the Kong cluster - may produce different results. For example, one specific node of the Kong - cluster may be experiencing network issues, causing it to fail to connect to - some Targets: these Targets will be marked as unhealthy by that node - (directing traffic from this node to other Targets that it can successfully - reach), but healthy to all others Kong nodes (which have no problems using that - Target). + Displays the health status for all Targets of a given Upstream, or for + the whole Upstream, according to the perspective of a specific Kong node. + Note that, being node-specific information, making this same request + to different nodes of the Kong cluster may produce different results. + For example, one specific node of the Kong cluster may be experiencing + network issues, causing it to fail to connect to some Targets: these + Targets will be marked as unhealthy by that node (directing traffic from + this node to other Targets that it can successfully reach), but healthy + to all others Kong nodes (which have no problems using that Target). The `data` field of the response contains an array of Target objects. The health for each Target is returned in its `health` field: @@ -1078,6 +1078,11 @@ return { (circuit breakers) or [manually](#set-target-as-unhealthy), its status is displayed as `UNHEALTHY`. The load balancer is not directing any traffic to this Target via this Upstream. + + When the request query parameter `balancer_health` is set to `1`, the + `data` field of the response refers to the whole Upstream, and its `health` + attribute is defined by the state of all of Upstream's Targets, according + to the field [health checker's threshold][healthchecks.threshold]. ]], endpoint = [[
/upstreams/{name or id}/health/
@@ -1086,6 +1091,11 @@ return { ---:| --- `name or id`
**required** | The unique identifier **or** the name of the Upstream for which to display Target health. ]], + request_query = [[ + Attributes | Description + ---:| --- + `balancer_health`
*optional* | If set to 1, Kong will return the health status of the whole Upstream. + ]], response = [[ ``` HTTP 200 OK @@ -1115,6 +1125,22 @@ return { ] } ``` + + If `balancer_health=1`: + ``` + HTTP 200 OK + ``` + + ```json + { + "data": { + "health": "HEALTHY", + "id": "07131005-ba30-4204-a29f-0927d53257b4" + }, + "next": null, + "node_id": "cbb297c0-14a9-46bc-ad91-1d0ef9b42df9" + } + ``` ]], }, @@ -1153,6 +1179,7 @@ return { ["healthchecks.passive.unhealthy.tcp_failures"] = { description = [[Number of TCP failures in proxied traffic to consider a target unhealthy, as observed by passive health checks.]] }, ["healthchecks.passive.unhealthy.timeouts"] = { description = [[Number of timeouts in proxied traffic to consider a target unhealthy, as observed by passive health checks.]] }, ["healthchecks.passive.unhealthy.http_failures"] = { description = [[Number of HTTP failures in proxied traffic (as defined by `healthchecks.passive.unhealthy.http_statuses`) to consider a target unhealthy, as observed by passive health checks.]] }, + ["healthchecks.threshold"] = { description = [[The minimum percentage of the upstream's targets' weight that must be available for the whole upstream to be considered healthy.]] }, tags = { description = [[ An optional set of strings associated with the Upstream, for grouping and filtering. diff --git a/kong/api/routes/upstreams.lua b/kong/api/routes/upstreams.lua index 50044e8e21ff..c7b53146b4c8 100644 --- a/kong/api/routes/upstreams.lua +++ b/kong/api/routes/upstreams.lua @@ -59,6 +59,21 @@ return { return kong.response.exit(404, { message = "Not found" }) end + local node_id, err = kong.node.get_id() + if err then + kong.log.err("failed to get node id: ", err) + end + + if tostring(self.params.balancer_health) == "1" then + local upstream_pk = db.upstreams.schema:extract_pk_values(upstream) + local balancer_health = db.targets:get_balancer_health(upstream_pk) + return kong.response.exit(200, { + data = balancer_health, + next = null, + node_id = node_id, + }) + end + self.params.targets = db.upstreams.schema:extract_pk_values(upstream) local targets_with_health, _, err_t, offset = endpoints.page_collection(self, db, db.targets.schema, "page_for_upstream_with_health") @@ -71,11 +86,6 @@ return { self.params.upstreams, escape_uri(offset)) or null - local node_id, err = kong.node.get_id() - if err then - kong.log.err("failed getting node id: ", err) - end - return kong.response.exit(200, { data = targets_with_health, offset = offset, diff --git a/kong/db/dao/targets.lua b/kong/db/dao/targets.lua index 32942264a4da..54efef85dde0 100644 --- a/kong/db/dao/targets.lua +++ b/kong/db/dao/targets.lua @@ -364,4 +364,14 @@ function _TARGETS:post_health(upstream_pk, target, address, is_healthy) end +function _TARGETS:get_balancer_health(upstream_pk) + local health_info, err = balancer.get_balancer_health(upstream_pk.id) + if err then + ngx.log(ngx.ERR, "failed getting upstream health: ", err) + end + + return health_info +end + + return _TARGETS diff --git a/kong/db/schema/entities/upstreams.lua b/kong/db/schema/entities/upstreams.lua index 04b4ac5fdba7..d7e2c6376080 100644 --- a/kong/db/schema/entities/upstreams.lua +++ b/kong/db/schema/entities/upstreams.lua @@ -63,6 +63,13 @@ local check_verify_certificate = Schema.define { } +local health_threshold = Schema.define { + type = "number", + default = 0, + between = { 0, 100 }, +} + + local NO_DEFAULT = {} @@ -145,6 +152,7 @@ end local healthchecks_fields, healthchecks_defaults = gen_fields(healthchecks_config) +healthchecks_fields[#healthchecks_fields+1] = { ["threshold"] = health_threshold } local r = { diff --git a/kong/runloop/balancer.lua b/kong/runloop/balancer.lua index df393e9431b8..5926bc41e8af 100644 --- a/kong/runloop/balancer.lua +++ b/kong/runloop/balancer.lua @@ -415,10 +415,13 @@ do end creating[upstream.id] = true + local health_threshold = upstream.healthchecks and + upstream.healthchecks.threshold or nil local balancer, err = balancer_types[upstream.algorithm].new({ wheelSize = upstream.slots, -- will be ignored by least-connections dns = dns_client, + healthThreshold = health_threshold, }) if not balancer then @@ -984,6 +987,19 @@ local function unsubscribe_from_healthcheck_events(callback) end +local function is_upstream_using_healthcheck(upstream) + if upstream ~= nil then + return upstream.healthchecks.active.healthy.interval ~= 0 + or upstream.healthchecks.active.unhealthy.interval ~= 0 + or upstream.healthchecks.passive.unhealthy.tcp_failures ~= 0 + or upstream.healthchecks.passive.unhealthy.timeouts ~= 0 + or upstream.healthchecks.passive.unhealthy.http_failures ~= 0 + end + + return false +end + + -------------------------------------------------------------------------------- -- Get healthcheck information for an upstream. -- @param upstream_id the id of the upstream. @@ -998,11 +1014,7 @@ local function get_upstream_health(upstream_id) return nil, "upstream not found" end - local using_hc = upstream.healthchecks.active.healthy.interval ~= 0 - or upstream.healthchecks.active.unhealthy.interval ~= 0 - or upstream.healthchecks.passive.unhealthy.tcp_failures ~= 0 - or upstream.healthchecks.passive.unhealthy.timeouts ~= 0 - or upstream.healthchecks.passive.unhealthy.http_failures ~= 0 + local using_hc = is_upstream_using_healthcheck(upstream) local balancer = balancers[upstream_id] if not balancer then @@ -1036,6 +1048,41 @@ local function get_upstream_health(upstream_id) end +-------------------------------------------------------------------------------- +-- Get healthcheck information for a balancer. +-- @param upstream_id the id of the upstream. +-- @return table with balancer health info +local function get_balancer_health(upstream_id) + + local upstream = get_upstream_by_id(upstream_id) + if not upstream then + return nil, "upstream not found" + end + + local balancer = balancers[upstream_id] + if not balancer then + return nil, "balancer not found" + end + + local healthchecker + local health = "HEALTHCHECKS_OFF" + if is_upstream_using_healthcheck(upstream) then + healthchecker = healthcheckers[balancer] + if not healthchecker then + return nil, "healthchecker not found" + end + + local balancer_status = balancer:getStatus() + health = balancer_status.healthy and "HEALTHY" or "UNHEALTHY" + end + + return { + health = health, + id = upstream_id, + } +end + + -------------------------------------------------------------------------------- -- for unit-testing purposes only local function _get_healthchecker(balancer) @@ -1062,6 +1109,7 @@ return { unsubscribe_from_healthcheck_events = unsubscribe_from_healthcheck_events, get_upstream_health = get_upstream_health, get_upstream_by_id = get_upstream_by_id, + get_balancer_health = get_balancer_health, -- ones below are exported for test purposes only _create_balancer = create_balancer, diff --git a/spec/01-unit/01-db/01-schema/09-upstreams_spec.lua b/spec/01-unit/01-db/01-schema/09-upstreams_spec.lua index 3d04030d1d08..cb5e16915468 100644 --- a/spec/01-unit/01-db/01-schema/09-upstreams_spec.lua +++ b/spec/01-unit/01-db/01-schema/09-upstreams_spec.lua @@ -275,9 +275,11 @@ describe("load upstreams", function() local status_code = "value should be between 100 and 999" local integer = "expected an integer" local boolean = "expected a boolean" + local number = "expected a number" local invalid_host = "invalid value: " local invalid_host_port = "must not have a port" local invalid_ip = "must not be an IP" + local threshold = "value should be between 0 and 100" local tests = { {{ active = { timeout = -1 }}, seconds }, {{ active = { timeout = 1e+42 }}, seconds }, @@ -309,6 +311,10 @@ describe("load upstreams", function() {{ active = { healthy = { http_statuses = { 1000 }}}}, status_code }, {{ active = { healthy = { http_statuses = { 111.314 }}}}, integer }, {{ active = { healthy = { successes = 0.5 }}}, integer }, + {{ active = { unhealthy = { timeouts = 1 }}, threshold = -1}, threshold }, + {{ active = { unhealthy = { timeouts = 1 }}, threshold = 101}, threshold }, + {{ active = { unhealthy = { timeouts = 1 }}, threshold = "50"}, number }, + {{ active = { unhealthy = { timeouts = 1 }}, threshold = true}, number }, --{{ active = { healthy = { successes = 0 }}}, "must be an integer" }, {{ active = { healthy = { successes = -1 }}}, zero_integer }, {{ active = { unhealthy = { interval = -1 }}}, seconds }, @@ -348,6 +354,11 @@ describe("load upstreams", function() {{ passive = { unhealthy = { http_failures = 0.5 }}}, integer }, --{{ passive = { unhealthy = { http_failures = 0 }}}, integer }, {{ passive = { unhealthy = { http_failures = -1 }}}, zero_integer }, + {{ passive = { unhealthy = { timeouts = 1 }}, threshold = -1}, threshold }, + {{ passive = { unhealthy = { timeouts = 1 }}, threshold = 101}, threshold }, + {{ passive = { unhealthy = { timeouts = 1 }}, threshold = "50"}, number }, + {{ passive = { unhealthy = { timeouts = 1 }}, threshold = true}, number }, + --]] } @@ -386,12 +397,18 @@ describe("load upstreams", function() { active = { unhealthy = { tcp_failures = 3 }}}, { active = { unhealthy = { timeouts = 9 }}}, { active = { unhealthy = { http_failures = 2 }}}, + { active = { unhealthy = { http_failures = 2 }}, threshold = 0}, + { active = { unhealthy = { http_failures = 2 }}, threshold = 50.50}, + { active = { unhealthy = { http_failures = 2 }}, threshold = 100}, { passive = { healthy = { http_statuses = { 200, 201 } }}}, { passive = { healthy = { successes = 2 }}}, { passive = { unhealthy = { http_statuses = { 400, 500 } }}}, { passive = { unhealthy = { tcp_failures = 8 }}}, { passive = { unhealthy = { timeouts = 1 }}}, { passive = { unhealthy = { http_failures = 2 }}}, + { passive = { unhealthy = { http_failures = 2 }}, threshold = 0}, + { passive = { unhealthy = { http_failures = 2 }}, threshold = 50.50}, + { passive = { unhealthy = { http_failures = 2 }}, threshold = 100}, } for _, test in ipairs(tests) do local entity = { diff --git a/spec/01-unit/01-db/01-schema/11-declarative_config/03-flatten_spec.lua b/spec/01-unit/01-db/01-schema/11-declarative_config/03-flatten_spec.lua index 5ea0cac4ec05..2afd7f647ff8 100644 --- a/spec/01-unit/01-db/01-schema/11-declarative_config/03-flatten_spec.lua +++ b/spec/01-unit/01-db/01-schema/11-declarative_config/03-flatten_spec.lua @@ -1359,7 +1359,8 @@ describe("declarative config: flatten", function() tcp_failures = 0, timeouts = 0 } - } + }, + threshold = 0 }, host_header = null, id = "UUID", @@ -1408,7 +1409,8 @@ describe("declarative config: flatten", function() tcp_failures = 0, timeouts = 0 } - } + }, + threshold = 0 }, host_header = null, id = "UUID", diff --git a/spec/02-integration/05-proxy/10-balancer/01-ring-balancer_spec.lua b/spec/02-integration/05-proxy/10-balancer/01-ring-balancer_spec.lua index 2011a28a980d..fac2767a3128 100644 --- a/spec/02-integration/05-proxy/10-balancer/01-ring-balancer_spec.lua +++ b/spec/02-integration/05-proxy/10-balancer/01-ring-balancer_spec.lua @@ -190,6 +190,7 @@ local add_upstream local patch_upstream local get_upstream local get_upstream_health +local get_balancer_health local post_target_address_health local get_router_version local add_target @@ -260,6 +261,14 @@ do end end + get_balancer_health = function(upstream_id, forced_port) + local path = "/upstreams/" .. upstream_id .."/health?balancer_health=1" + local status, body = api_send("GET", path, nil, forced_port) + if status == 200 then + return body + end + end + post_target_address_health = function(upstream_id, target_id, address, mode, forced_port) local path = "/upstreams/" .. upstream_id .. "/targets/" .. target_id .. "/" .. address .. "/" .. mode return api_send("POST", path, {}, forced_port) @@ -1072,7 +1081,8 @@ for _, strategy in helpers.each_strategy() do tcp_failures = 0, timeouts = 0 } - } + }, + threshold = 0 } local upstream_data = get_upstream(upstream_id) @@ -1319,6 +1329,114 @@ for _, strategy in helpers.each_strategy() do end end) + it("threshold for health checks", function() + local dns_mock_filename = helpers.test_conf.prefix .. "/dns_mock_records.lua" + finally(function() + os.remove(dns_mock_filename) + end) + + local fixtures = { + dns_mock = helpers.dns_mock.new() + } + fixtures.dns_mock:A { + name = "health-threshold.test", + address = "127.0.0.1", + } + fixtures.dns_mock:A { + name = "health-threshold.test", + address = "127.0.0.2", + } + fixtures.dns_mock:A { + name = "health-threshold.test", + address = "127.0.0.3", + } + fixtures.dns_mock:A { + name = "health-threshold.test", + address = "127.0.0.4", + } + + -- restart Kong + begin_testcase_setup_update(strategy, bp) + helpers.restart_kong({ + database = strategy, + nginx_conf = "spec/fixtures/custom_nginx.template", + lua_ssl_trusted_certificate = "../spec/fixtures/kong_spec.crt", + db_update_frequency = 0.1, + stream_listen = "0.0.0.0:9100", + plugins = "bundled,fail-once-auth", + }, nil, fixtures) + end_testcase_setup(strategy, bp) + ngx.sleep(1) + + local health_threshold = { 0, 25, 75, 99, 100 } + for i = 1, 5 do + -- configure healthchecks + begin_testcase_setup(strategy, bp) + local upstream_name, upstream_id = add_upstream(bp, { + healthchecks = healthchecks_config { + passive = { + unhealthy = { + tcp_failures = 1, + } + }, + threshold = health_threshold[i], + } + }) + + add_target(bp, upstream_id, "health-threshold.test", 80, { weight = 25 }) + end_testcase_setup(strategy, bp) + + -- 100% healthy + post_target_address_health(upstream_id, "health-threshold.test:80", "127.0.0.1:80", "healthy") + post_target_address_health(upstream_id, "health-threshold.test:80", "127.0.0.2:80", "healthy") + post_target_address_health(upstream_id, "health-threshold.test:80", "127.0.0.3:80", "healthy") + post_target_address_health(upstream_id, "health-threshold.test:80", "127.0.0.4:80", "healthy") + + local health = get_balancer_health(upstream_name) + assert.is.table(health) + assert.is.table(health.data) + + if health_threshold[i] < 100 then + assert.equals("HEALTHY", health.data.health) + else + assert.equals("UNHEALTHY", health.data.health) + end + + -- 75% healthy + post_target_address_health(upstream_id, "health-threshold.test:80", "127.0.0.1:80", "unhealthy") + health = get_balancer_health(upstream_name) + if health_threshold[i] < 75 then + assert.equals("HEALTHY", health.data.health) + else + assert.equals("UNHEALTHY", health.data.health) + end + + -- 50% healthy + post_target_address_health(upstream_id, "health-threshold.test:80", "127.0.0.2:80", "unhealthy") + health = get_balancer_health(upstream_name) + if health_threshold[i] < 50 then + assert.equals("HEALTHY", health.data.health) + else + assert.equals("UNHEALTHY", health.data.health) + end + + -- 25% healthy + post_target_address_health(upstream_id, "health-threshold.test:80", "127.0.0.3:80", "unhealthy") + health = get_balancer_health(upstream_name) + if health_threshold[i] < 25 then + assert.equals("HEALTHY", health.data.health) + else + assert.equals("UNHEALTHY", health.data.health) + end + + -- 0% healthy + post_target_address_health(upstream_id, "health-threshold.test:80", "127.0.0.4:80", "unhealthy") + health = get_balancer_health(upstream_name) + assert.equals("UNHEALTHY", health.data.health) + + end + end) + stream_it("#stream and http modules do not duplicate active health checks", function() local port1 = gen_port()