From ecaa2030914e9777c2acd5a7b8baf8d8de663490 Mon Sep 17 00:00:00 2001
From: stefanprodan
Date: Thu, 9 May 2019 13:49:48 +0300
Subject: [PATCH 1/4] Fix custom metric checks

- escape the prom query before encoding it
---
 pkg/metrics/observer.go | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/pkg/metrics/observer.go b/pkg/metrics/observer.go
index ddb871199..ef0a5cfd4 100644
--- a/pkg/metrics/observer.go
+++ b/pkg/metrics/observer.go
@@ -99,7 +99,9 @@ func (c *Observer) GetScalar(query string) (float64, error) {
 	query = strings.Replace(query, " ", "", -1)
 
 	var value *float64
-	result, err := c.queryMetric(query)
+
+	querySt := url.QueryEscape(query)
+	result, err := c.queryMetric(querySt)
 	if err != nil {
 		return 0, err
 	}

From 121a65fad0354afe65ef0792327869f2b028c730 Mon Sep 17 00:00:00 2001
From: stefanprodan
Date: Thu, 9 May 2019 13:50:47 +0300
Subject: [PATCH 2/4] Fix nginx promql namespace selector

---
 pkg/metrics/nginx.go | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/pkg/metrics/nginx.go b/pkg/metrics/nginx.go
index 7c6eb56e7..bdd1c8f3f 100644
--- a/pkg/metrics/nginx.go
+++ b/pkg/metrics/nginx.go
@@ -9,13 +9,13 @@ import (
 
 const nginxSuccessRateQuery = `
 sum(rate(
-nginx_ingress_controller_requests{kubernetes_namespace="{{ .Namespace }}",
+nginx_ingress_controller_requests{namespace="{{ .Namespace }}",
 ingress="{{ .Name }}",
 status!~"5.*"}
 [{{ .Interval }}]))
 /
 sum(rate(
-nginx_ingress_controller_requests{kubernetes_namespace="{{ .Namespace }}",
+nginx_ingress_controller_requests{namespace="{{ .Namespace }}",
 ingress="{{ .Name }}"}
 [{{ .Interval }}]))
 * 100
@@ -68,10 +68,10 @@ func (c *Observer) GetNginxSuccessRate(name string, namespace string, metric str
 
 const nginxRequestDurationQuery = `
 sum(rate(
-nginx_ingress_controller_ingress_upstream_latency_seconds_sum{kubernetes_namespace="{{ .Namespace }}",
+nginx_ingress_controller_ingress_upstream_latency_seconds_sum{namespace="{{ .Namespace }}",
 ingress="{{ .Name }}"}[{{ .Interval }}]))
 /
-sum(rate(nginx_ingress_controller_ingress_upstream_latency_seconds_count{kubernetes_namespace="{{ .Namespace }}",
+sum(rate(nginx_ingress_controller_ingress_upstream_latency_seconds_count{namespace="{{ .Namespace }}",
 ingress="{{ .Name }}"}[{{ .Interval }}]))
 * 1000
 `

From 8d0b54e0595e05982c4736e516ed208110b5aed5 Mon Sep 17 00:00:00 2001
From: stefanprodan
Date: Thu, 9 May 2019 13:51:37 +0300
Subject: [PATCH 3/4] Add custom metrics to nginx docs

---
 artifacts/nginx/canary.yaml                | 17 +++-
 .../usage/nginx-progressive-delivery.md    | 68 ++++++++++++++++++-
 2 files changed, 80 insertions(+), 5 deletions(-)

diff --git a/artifacts/nginx/canary.yaml b/artifacts/nginx/canary.yaml
index bca0a7098..6186889b6 100644
--- a/artifacts/nginx/canary.yaml
+++ b/artifacts/nginx/canary.yaml
@@ -43,11 +43,20 @@ spec:
       # percentage (0-100)
       threshold: 99
       interval: 1m
-    - name: request-duration
-      # maximum avg req duration
-      # milliseconds
-      threshold: 500
+    - name: "latency"
+      threshold: 0.5
       interval: 1m
+      query: |
+        histogram_quantile(0.99,
+          sum(
+            rate(
+              http_request_duration_seconds_bucket{
+                kubernetes_namespace="test",
+                kubernetes_pod_name=~"podinfo-[0-9a-zA-Z]+(-[0-9a-zA-Z]+)"
+              }[1m]
+            )
+          ) by (le)
+        )
     # external checks (optional)
     webhooks:
     - name: load-test
diff --git a/docs/gitbook/usage/nginx-progressive-delivery.md b/docs/gitbook/usage/nginx-progressive-delivery.md
index 2fa7a5493..95260a480 100644
--- a/docs/gitbook/usage/nginx-progressive-delivery.md
+++ b/docs/gitbook/usage/nginx-progressive-delivery.md
@@ -14,7 +14,9 @@ Install NGINX with Helm:
 helm upgrade -i nginx-ingress stable/nginx-ingress \
 --namespace ingress-nginx \
 --set controller.stats.enabled=true \
---set controller.metrics.enabled=true
+--set controller.metrics.enabled=true \
+--set controller.podAnnotations."prometheus\.io/scrape"=true \
+--set controller.podAnnotations."prometheus\.io/port"=10254
 ```
 
 Install Flagger and the Prometheus add-on in the same namespace as NGINX:
@@ -276,6 +278,70 @@ Events:
   Warning Synced 1m flagger Canary failed! Scaling down podinfo.test
 ```
 
+### Custom metrics
+
+The canary analysis can be extended with Prometheus queries.
+
+The demo app exposes Prometheus metrics, so you can create a custom check that uses the HTTP request duration
+histogram to validate the canary.
+
+Edit the canary analysis and add the following metric:
+
+```yaml
+  canaryAnalysis:
+    metrics:
+    - name: "latency"
+      threshold: 0.5
+      interval: 1m
+      query: |
+        histogram_quantile(0.99,
+          sum(
+            rate(
+              http_request_duration_seconds_bucket{
+                kubernetes_namespace="test",
+                kubernetes_pod_name=~"podinfo-[0-9a-zA-Z]+(-[0-9a-zA-Z]+)"
+              }[1m]
+            )
+          ) by (le)
+        )
+```
+
+The threshold is set to 500ms, so if the 99th percentile of the request duration over the last minute
+goes above half a second, the analysis fails and the canary is not promoted.
+
+Trigger a canary deployment by updating the container image:
+
+```bash
+kubectl -n test set image deployment/podinfo \
+podinfod=quay.io/stefanprodan/podinfo:1.4.3
+```
+
+Generate high response latency:
+
+```bash
+watch curl http://app.example.com/delay/2
+```
+
+Watch Flagger logs:
+
+```
+kubectl -n nginx-ingress logs deployment/flagger -f | jq .msg
+
+Starting canary deployment for podinfo.test
+Advance podinfo.test canary weight 5
+Advance podinfo.test canary weight 10
+Advance podinfo.test canary weight 15
+Halt podinfo.test advancement latency 1.20 > 0.5
+Halt podinfo.test advancement latency 1.45 > 0.5
+Halt podinfo.test advancement latency 1.60 > 0.5
+Halt podinfo.test advancement latency 1.69 > 0.5
+Halt podinfo.test advancement latency 1.70 > 0.5
+Rolling back podinfo.test failed checks threshold reached 5
+Canary failed! Scaling down podinfo.test
+```
+
+If you have Slack configured, Flagger will send a notification with the reason why the canary failed.
+
 ### A/B Testing
 
 Besides weighted routing, Flagger can be configured to route traffic to the canary based on HTTP match conditions.
From 2ff695ecfe471d1932c7b6434ae52b53e0007ff2 Mon Sep 17 00:00:00 2001
From: stefanprodan
Date: Thu, 9 May 2019 14:00:15 +0300
Subject: [PATCH 4/4] Fix nginx metrics tests

---
 pkg/metrics/nginx_test.go | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pkg/metrics/nginx_test.go b/pkg/metrics/nginx_test.go
index e2a93c5f1..00e9ef903 100644
--- a/pkg/metrics/nginx_test.go
+++ b/pkg/metrics/nginx_test.go
@@ -20,7 +20,7 @@ func Test_NginxSuccessRateQueryRender(t *testing.T) {
 		t.Fatal(err)
 	}
 
-	expected := `sum(rate(nginx_ingress_controller_requests{kubernetes_namespace="nginx",ingress="podinfo",status!~"5.*"}[1m])) / sum(rate(nginx_ingress_controller_requests{kubernetes_namespace="nginx",ingress="podinfo"}[1m])) * 100`
+	expected := `sum(rate(nginx_ingress_controller_requests{namespace="nginx",ingress="podinfo",status!~"5.*"}[1m])) / sum(rate(nginx_ingress_controller_requests{namespace="nginx",ingress="podinfo"}[1m])) * 100`
 
 	if query != expected {
 		t.Errorf("\nGot %s \nWanted %s", query, expected)
@@ -43,7 +43,7 @@ func Test_NginxRequestDurationQueryRender(t *testing.T) {
 		t.Fatal(err)
 	}
 
-	expected := `sum(rate(nginx_ingress_controller_ingress_upstream_latency_seconds_sum{kubernetes_namespace="nginx",ingress="podinfo"}[1m])) /sum(rate(nginx_ingress_controller_ingress_upstream_latency_seconds_count{kubernetes_namespace="nginx",ingress="podinfo"}[1m])) * 1000`
+	expected := `sum(rate(nginx_ingress_controller_ingress_upstream_latency_seconds_sum{namespace="nginx",ingress="podinfo"}[1m])) /sum(rate(nginx_ingress_controller_ingress_upstream_latency_seconds_count{namespace="nginx",ingress="podinfo"}[1m])) * 1000`
 
 	if query != expected {
 		t.Errorf("\nGot %s \nWanted %s", query, expected)
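
To see why the escaping added in PATCH 1/4 matters for multi-line custom queries such as the latency check above, here is a minimal, self-contained Go sketch. The Prometheus address and the URL construction are illustrative assumptions for this example only, not the actual queryMetric implementation in pkg/metrics/observer.go.

```go
package main

import (
	"fmt"
	"net/url"
	"strings"
)

func main() {
	// A multi-line PromQL query, as it would appear in the canary spec.
	query := `
histogram_quantile(0.99,
  sum(
    rate(
      http_request_duration_seconds_bucket{
        kubernetes_namespace="test",
        kubernetes_pod_name=~"podinfo-[0-9a-zA-Z]+(-[0-9a-zA-Z]+)"
      }[1m]
    )
  ) by (le)
)`

	// GetScalar strips spaces before querying (see PATCH 1/4); newlines,
	// quotes, braces and the '+' in the regex remain and need URL escaping.
	query = strings.Replace(query, " ", "", -1)

	promURL := "http://prometheus:9090" // assumed Prometheus address for this example

	// Without escaping, '+' decodes back to a space on the server side and the
	// quotes and braces are not valid characters in a URL query string.
	raw := fmt.Sprintf("%s/api/v1/query?query=%s", promURL, query)

	// With url.QueryEscape, as introduced in the observer fix, the query survives intact.
	escaped := fmt.Sprintf("%s/api/v1/query?query=%s", promURL, url.QueryEscape(query))

	fmt.Println("raw:    ", raw)
	fmt.Println("escaped:", escaped)
}
```

Running the sketch prints both forms of the request URL; only the escaped one round-trips the label matchers and the regex unchanged.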