diff --git a/README.md b/README.md index 95b16c42f..7f6be2a4d 100644 --- a/README.md +++ b/README.md @@ -41,6 +41,7 @@ Flagger documentation can be found at [docs.flagger.app](https://docs.flagger.ap * [Istio canary deployments](https://docs.flagger.app/usage/progressive-delivery) * [Linkerd canary deployments](https://docs.flagger.app/usage/linkerd-progressive-delivery) * [App Mesh canary deployments](https://docs.flagger.app/usage/appmesh-progressive-delivery) + * [Crossover canary deployments](https://docs.flagger.app/usage/crossover-progressive-delivery) * [NGINX ingress controller canary deployments](https://docs.flagger.app/usage/nginx-progressive-delivery) * [Gloo ingress controller canary deployments](https://docs.flagger.app/usage/gloo-progressive-delivery) * [Blue/Green deployments](https://docs.flagger.app/usage/blue-green) diff --git a/charts/grafana/dashboards/envoy.json b/charts/grafana/dashboards/envoy.json new file mode 100644 index 000000000..46b27a66b --- /dev/null +++ b/charts/grafana/dashboards/envoy.json @@ -0,0 +1,1226 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "id": 3, + "iteration": 1553160305729, + "links": [], + "panels": [ + { + "content": "
\nRED: $target.$namespace\n
", + "gridPos": { + "h": 3, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 89, + "links": [], + "mode": "html", + "title": "", + "transparent": true, + "type": "text" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "prometheus", + "format": "ops", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 3 + }, + "id": 90, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": true, + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "tableColumn": "", + "targets": [ + { + "expr": "round(sum(rate(envoy_cluster_upstream_rq{kubernetes_namespace=~\"$namespace\",envoy_cluster_name=~\"$target-primary\"}[30s])), 0.001)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 4 + } + ], + "thresholds": "", + "title": "Primary: Incoming Request Volume", + "transparent": false, + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "prometheus", + "decimals": null, + "format": "percentunit", + "gauge": { + "maxValue": 100, + 
"minValue": 80, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": false + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 6, + "y": 3 + }, + "id": 98, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": true, + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(irate(envoy_cluster_upstream_rq{kubernetes_namespace=~\"$namespace\",envoy_cluster_name=~\"$target-primary\",envoy_response_code!~\"5.*\"}[30s])) / sum(irate(envoy_cluster_upstream_rq{kubernetes_namespace=~\"$namespace\",envoy_cluster_name=~\"$target-primary\"}[30s]))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "", + "refId": "B" + } + ], + "thresholds": "95, 99, 99.5", + "title": "Incoming Success Rate", + "transparent": false, + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "prometheus", + "format": "ops", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 12, + "y": 3 + }, + "id": 97, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + 
"maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(40, 224, 65, 0.18)", + "full": true, + "lineColor": "#7eb26d", + "show": true + }, + "tableColumn": "", + "targets": [ + { + "expr": "round(sum(rate(envoy_cluster_upstream_rq{kubernetes_namespace=~\"$namespace\",envoy_cluster_name=~\"$target-canary\"}[30s])), 0.001)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 4 + } + ], + "thresholds": "", + "title": "Canary: Incoming Request Volume", + "transparent": false, + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "prometheus", + "decimals": null, + "format": "percentunit", + "gauge": { + "maxValue": 100, + "minValue": 80, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": false + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 18, + "y": 3 + }, + "id": 99, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(40, 224, 65, 0.18)", + "full": true, + "lineColor": "#7eb26d", + "show": true + }, + "tableColumn": "", + "targets": [ + { + "expr": 
"sum(irate(envoy_cluster_upstream_rq{kubernetes_namespace=~\"$namespace\",envoy_cluster_name=~\"$target-canary\",envoy_response_code!~\"5.*\"}[30s])) / sum(irate(envoy_cluster_upstream_rq{kubernetes_namespace=~\"$namespace\",envoy_cluster_name=~\"$target-canary\"}[30s]))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "", + "refId": "B" + } + ], + "thresholds": "95, 99, 99.5", + "title": "Incoming Success Rate", + "transparent": false, + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 4, + "w": 12, + "x": 0, + "y": 7 + }, + "id": 96, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": true, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "(sum(rate(envoy_cluster_upstream_cx_rx_bytes_total{kubernetes_namespace=~\"$namespace\",envoy_cluster_name=~\"$target-primary\"}[30s])))", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "traffic", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Primary: Incoming Traffic", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": 
null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 4, + "w": 12, + "x": 12, + "y": 7 + }, + "id": 91, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": true, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "(sum(rate(envoy_cluster_upstream_cx_rx_bytes_total{kubernetes_namespace=~\"$namespace\",envoy_cluster_name=~\"$target-canary\"}[30s])))", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "traffic", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Canary: Incoming Traffic", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "content": "
\nUSE: $target.$namespace\n
", + "gridPos": { + "h": 3, + "w": 24, + "x": 0, + "y": 11 + }, + "id": 101, + "links": [], + "mode": "html", + "title": "", + "transparent": true, + "type": "text" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 14 + }, + "id": 100, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(container_cpu_usage_seconds_total{cpu=\"total\",namespace=\"$namespace\",pod_name=~\"$target-primary.*\", container_name!~\"POD|istio-proxy\"}[1m])) by (pod_name)", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{ pod_name }}", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Primary: CPU Usage by Pod", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": "CPU seconds / second", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 14 + }, + 
"id": 102, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(container_cpu_usage_seconds_total{cpu=\"total\",namespace=\"$namespace\",pod_name=~\"$target.*\", pod_name!~\"$target-primary.*\", container_name!~\"POD|istio-proxy\"}[1m])) by (pod_name)", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{ pod_name }}", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Canary: CPU Usage by Pod", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": "CPU seconds / second", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 20 + }, + "id": 103, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": 
false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(container_memory_working_set_bytes{namespace=\"$namespace\",pod_name=~\"$target-primary.*\", container_name!~\"POD|istio-proxy\"}) by (pod_name)", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{ pod_name }}", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Primary: Memory Usage by Pod", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "bytes", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 20 + }, + "id": 104, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(container_memory_working_set_bytes{namespace=\"$namespace\",pod_name=~\"$target.*\", pod_name!~\"$target-primary.*\", container_name!~\"POD|istio-proxy\"}) by 
(pod_name)", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{ pod_name }}", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Canary: Memory Usage by Pod", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "bytes", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 26 + }, + "id": 105, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "received", + "color": "#f9d9f9" + }, + { + "alias": "transmited", + "color": "#f29191" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate (container_network_receive_bytes_total{namespace=\"$namespace\",pod_name=~\"$target-primary.*\"}[1m])) ", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "received", + "refId": "A" + }, + { + "expr": "-sum (rate (container_network_transmit_bytes_total{namespace=\"$namespace\",pod_name=~\"$target-primary.*\"}[1m]))", + "format": "time_series", + 
"intervalFactor": 1, + "legendFormat": "transmited", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Primary: Network I/O", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "Bps", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 26 + }, + "id": 106, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "received", + "color": "#f9d9f9" + }, + { + "alias": "transmited", + "color": "#f29191" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate (container_network_receive_bytes_total{namespace=\"$namespace\",pod_name=~\"$target.*\",pod_name!~\"$target-primary.*\"}[1m])) ", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "received", + "refId": "A" + }, + { + "expr": "-sum (rate (container_network_transmit_bytes_total{namespace=\"$namespace\",pod_name=~\"$target.*\",pod_name!~\"$target-primary.*\"}[1m]))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "transmited", 
+ "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Canary: Network I/O", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "Bps", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "refresh": "10s", + "schemaVersion": 16, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "allValue": null, + "current": null, + "datasource": "prometheus", + "definition": "query_result(sum(envoy_cluster_upstream_rq) by (kubernetes_namespace))", + "hide": 0, + "includeAll": false, + "label": "Namespace", + "multi": false, + "name": "namespace", + "options": [], + "query": "query_result(sum(envoy_cluster_upstream_rq) by (kubernetes_namespace))", + "refresh": 1, + "regex": "/.*_namespace=\"([^\"]*).*/", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": null, + "datasource": "prometheus", + "definition": "query_result(sum(envoy_cluster_upstream_rq{kubernetes_namespace=\"$namespace\",envoy_cluster_name=~\".*-primary\"}) by (envoy_cluster_name))", + "hide": 0, + "includeAll": false, + "label": "Target", + "multi": false, + "name": "target", + "options": [], + "query": "query_result(sum(envoy_cluster_upstream_rq{kubernetes_namespace=\"$namespace\",envoy_cluster_name=~\".*-primary\"}) by (envoy_cluster_name))", + "refresh": 1, + "regex": "/.*envoy_cluster_name=\"(.*)-primary\"/", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": 
"query", + "useTags": false + } + ] + }, + "time": { + "from": "now-5m", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Envoy Canary", + "uid": "flagger-envoy", + "version": 6 +} \ No newline at end of file diff --git a/cmd/flagger/main.go b/cmd/flagger/main.go index 8548de560..7d9df9138 100644 --- a/cmd/flagger/main.go +++ b/cmd/flagger/main.go @@ -161,7 +161,7 @@ func main() { logger.Infof("Watching namespace %s", namespace) } - observerFactory, err := metrics.NewFactory(metricsServer, meshProvider, 5*time.Second) + observerFactory, err := metrics.NewFactory(metricsServer, 5*time.Second) if err != nil { logger.Fatalf("Error building prometheus client: %s", err.Error()) } diff --git a/docs/gitbook/usage/crossover-progressive-delivery.md b/docs/gitbook/usage/crossover-progressive-delivery.md new file mode 100644 index 000000000..1559f0fc9 --- /dev/null +++ b/docs/gitbook/usage/crossover-progressive-delivery.md @@ -0,0 +1,319 @@ +# Envoy/Crossover Canary Deployments + +This guide shows you how to use Envoy, [Crossover](https://github.com/mumoshu/crossover) and Flagger to automate canary deployments. + +Crossover is a minimal Envoy xDS implementation that supports the [Service Mesh Interface](https://smi-spec.io/). + +### Prerequisites + +Flagger requires a Kubernetes cluster **v1.11** or newer and Envoy paired with the [Crossover](https://github.com/mumoshu/crossover) sidecar. 
+ +Create a test namespace: + +```bash +kubectl create ns test +``` + +Install Envoy along with the Crossover sidecar with Helm: + +```bash +helm repo add crossover https://mumoshu.github.io/crossover + +helm upgrade --install envoy crossover/envoy \ + --namespace test \ + -f <(cat < 0.5s +Halt podinfo.test advancement request duration 1.45s > 0.5s +Rolling back podinfo.test failed checks threshold reached 5 +Canary failed! Scaling down podinfo.test +``` + +If you’ve enabled the Slack notifications, you’ll receive a message if the progress deadline is exceeded, +or if the analysis reached the maximum number of failed checks: + +![Flagger Slack Notifications](https://raw.githubusercontent.com/weaveworks/flagger/master/docs/screens/slack-canary-failed.png) diff --git a/pkg/controller/controller_test.go b/pkg/controller/controller_test.go index 8554c4366..6945ae2ad 100644 --- a/pkg/controller/controller_test.go +++ b/pkg/controller/controller_test.go @@ -73,7 +73,7 @@ func SetupMocks(c *flaggerv1.Canary) Mocks { rf := router.NewFactory(nil, kubeClient, flaggerClient, "annotationsPrefix", logger, flaggerClient) // init observer - observerFactory, _ := metrics.NewFactory("fake", "istio", 5*time.Second) + observerFactory, _ := metrics.NewFactory("fake", 5*time.Second) // init canary factory configTracker := canary.ConfigTracker{ diff --git a/pkg/controller/scheduler.go b/pkg/controller/scheduler.go index 25cdabb6c..307832654 100644 --- a/pkg/controller/scheduler.go +++ b/pkg/controller/scheduler.go @@ -13,6 +13,10 @@ import ( "github.com/weaveworks/flagger/pkg/router" ) +const ( + MetricsProviderServiceSuffix = ":service" +) + // scheduleCanaries synchronises the canary map with the jobs map, // for new canaries new jobs are created and started // for the removed canaries the jobs are stopped and deleted @@ -743,15 +747,27 @@ func (c *Controller) analyseCanary(r *flaggerv1.Canary) bool { } // override the global provider if one is specified in the canary spec - 
metricsProvider := c.meshProvider + var metricsProvider string + // set the metrics provider to Crossover Prometheus when Crossover is the mesh provider + // For example, `crossover` metrics provider should be used for `smi:crossover` mesh provider + if strings.Contains(c.meshProvider, "crossover") { + metricsProvider = "crossover" + } else { + metricsProvider = c.meshProvider + } + if r.Spec.Provider != "" { metricsProvider = r.Spec.Provider - // set the metrics server to Linkerd Prometheus when Linkerd is the default mesh provider + // set the metrics provider to Linkerd Prometheus when Linkerd is the default mesh provider if strings.Contains(c.meshProvider, "linkerd") { metricsProvider = "linkerd" } } + // set the metrics provider to query Prometheus for the canary Kubernetes service if the canary target is Service + if r.Spec.TargetRef.Kind == "Service" { + metricsProvider = metricsProvider + MetricsProviderServiceSuffix + } // create observer based on the mesh provider observerFactory := c.observerFactory @@ -761,7 +777,7 @@ func (c *Controller) analyseCanary(r *flaggerv1.Canary) bool { if r.Spec.MetricsServer != "" { metricsServer = r.Spec.MetricsServer var err error - observerFactory, err = metrics.NewFactory(metricsServer, metricsProvider, 5*time.Second) + observerFactory, err = metrics.NewFactory(metricsServer, 5*time.Second) if err != nil { c.recordEventErrorf(r, "Error building Prometheus client for %s %v", r.Spec.MetricsServer, err) return false @@ -779,8 +795,8 @@ func (c *Controller) analyseCanary(r *flaggerv1.Canary) bool { val, err := observer.GetRequestSuccessRate(r.Spec.TargetRef.Name, r.Namespace, metric.Interval) if err != nil { if strings.Contains(err.Error(), "no values found") { - c.recordEventWarningf(r, "Halt advancement no values found for metric %s probably %s.%s is not receiving traffic", - metric.Name, r.Spec.TargetRef.Name, r.Namespace) + c.recordEventWarningf(r, "Halt advancement no values found for %s metric %s probably %s.%s is not 
receiving traffic", + metricsProvider, metric.Name, r.Spec.TargetRef.Name, r.Namespace) } else { c.recordEventErrorf(r, "Metrics server %s query failed: %v", metricsServer, err) } @@ -799,8 +815,8 @@ func (c *Controller) analyseCanary(r *flaggerv1.Canary) bool { val, err := observer.GetRequestDuration(r.Spec.TargetRef.Name, r.Namespace, metric.Interval) if err != nil { if strings.Contains(err.Error(), "no values found") { - c.recordEventWarningf(r, "Halt advancement no values found for metric %s probably %s.%s is not receiving traffic", - metric.Name, r.Spec.TargetRef.Name, r.Namespace) + c.recordEventWarningf(r, "Halt advancement no values found for %s metric %s probably %s.%s is not receiving traffic", + metricsProvider, metric.Name, r.Spec.TargetRef.Name, r.Namespace) } else { c.recordEventErrorf(r, "Metrics server %s query failed: %v", metricsServer, err) } diff --git a/pkg/metrics/envoy.go b/pkg/metrics/appmesh.go similarity index 76% rename from pkg/metrics/envoy.go rename to pkg/metrics/appmesh.go index ce88b5407..f922c55a7 100644 --- a/pkg/metrics/envoy.go +++ b/pkg/metrics/appmesh.go @@ -4,7 +4,7 @@ import ( "time" ) -var envoyQueries = map[string]string{ +var appMeshQueries = map[string]string{ "request-success-rate": ` sum( rate( @@ -39,12 +39,12 @@ var envoyQueries = map[string]string{ )`, } -type EnvoyObserver struct { +type AppMeshObserver struct { client *PrometheusClient } -func (ob *EnvoyObserver) GetRequestSuccessRate(name string, namespace string, interval string) (float64, error) { - query, err := ob.client.RenderQuery(name, namespace, interval, envoyQueries["request-success-rate"]) +func (ob *AppMeshObserver) GetRequestSuccessRate(name string, namespace string, interval string) (float64, error) { + query, err := ob.client.RenderQuery(name, namespace, interval, appMeshQueries["request-success-rate"]) if err != nil { return 0, err } @@ -57,8 +57,8 @@ func (ob *EnvoyObserver) GetRequestSuccessRate(name string, namespace string, in return value, 
nil } -func (ob *EnvoyObserver) GetRequestDuration(name string, namespace string, interval string) (time.Duration, error) { - query, err := ob.client.RenderQuery(name, namespace, interval, envoyQueries["request-duration"]) +func (ob *AppMeshObserver) GetRequestDuration(name string, namespace string, interval string) (time.Duration, error) { + query, err := ob.client.RenderQuery(name, namespace, interval, appMeshQueries["request-duration"]) if err != nil { return 0, err } diff --git a/pkg/metrics/envoy_test.go b/pkg/metrics/appmesh_test.go similarity index 91% rename from pkg/metrics/envoy_test.go rename to pkg/metrics/appmesh_test.go index 442b59e7e..471be5c4f 100644 --- a/pkg/metrics/envoy_test.go +++ b/pkg/metrics/appmesh_test.go @@ -7,7 +7,7 @@ import ( "time" ) -func TestEnvoyObserver_GetRequestSuccessRate(t *testing.T) { +func TestAppMeshObserver_GetRequestSuccessRate(t *testing.T) { expected := ` sum( rate( envoy_cluster_upstream_rq{ kubernetes_namespace="default", kubernetes_pod_name=~"podinfo-[0-9a-zA-Z]+(-[0-9a-zA-Z]+)", envoy_response_code!~"5.*" }[1m] ) ) / sum( rate( envoy_cluster_upstream_rq{ kubernetes_namespace="default", kubernetes_pod_name=~"podinfo-[0-9a-zA-Z]+(-[0-9a-zA-Z]+)" }[1m] ) ) * 100` ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { @@ -26,7 +26,7 @@ func TestEnvoyObserver_GetRequestSuccessRate(t *testing.T) { t.Fatal(err) } - observer := &EnvoyObserver{ + observer := &AppMeshObserver{ client: client, } @@ -40,7 +40,7 @@ func TestEnvoyObserver_GetRequestSuccessRate(t *testing.T) { } } -func TestEnvoyObserver_GetRequestDuration(t *testing.T) { +func TestAppMeshObserver_GetRequestDuration(t *testing.T) { expected := ` histogram_quantile( 0.99, sum( rate( envoy_cluster_upstream_rq_time_bucket{ kubernetes_namespace="default", kubernetes_pod_name=~"podinfo-[0-9a-zA-Z]+(-[0-9a-zA-Z]+)" }[1m] ) ) by (le) )` ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { @@ -59,7 
+59,7 @@ func TestEnvoyObserver_GetRequestDuration(t *testing.T) { t.Fatal(err) } - observer := &EnvoyObserver{ + observer := &AppMeshObserver{ client: client, } diff --git a/pkg/metrics/crossover.go b/pkg/metrics/crossover.go new file mode 100644 index 000000000..54a5e2908 --- /dev/null +++ b/pkg/metrics/crossover.go @@ -0,0 +1,73 @@ +package metrics + +import ( + "time" +) + +var crossoverQueries = map[string]string{ + "request-success-rate": ` + sum( + rate( + envoy_cluster_upstream_rq{ + kubernetes_namespace="{{ .Namespace }}", + envoy_cluster_name=~"{{ .Name }}-canary", + envoy_response_code!~"5.*" + }[{{ .Interval }}] + ) + ) + / + sum( + rate( + envoy_cluster_upstream_rq{ + kubernetes_namespace="{{ .Namespace }}", + envoy_cluster_name=~"{{ .Name }}-canary" + }[{{ .Interval }}] + ) + ) + * 100`, + "request-duration": ` + histogram_quantile( + 0.99, + sum( + rate( + envoy_cluster_upstream_rq_time_bucket{ + kubernetes_namespace="{{ .Namespace }}", + envoy_cluster_name=~"{{ .Name }}-canary" + }[{{ .Interval }}] + ) + ) by (le) + )`, +} + +type CrossoverObserver struct { + client *PrometheusClient +} + +func (ob *CrossoverObserver) GetRequestSuccessRate(name string, namespace string, interval string) (float64, error) { + query, err := ob.client.RenderQuery(name, namespace, interval, crossoverQueries["request-success-rate"]) + if err != nil { + return 0, err + } + + value, err := ob.client.RunQuery(query) + if err != nil { + return 0, err + } + + return value, nil +} + +func (ob *CrossoverObserver) GetRequestDuration(name string, namespace string, interval string) (time.Duration, error) { + query, err := ob.client.RenderQuery(name, namespace, interval, crossoverQueries["request-duration"]) + if err != nil { + return 0, err + } + + value, err := ob.client.RunQuery(query) + if err != nil { + return 0, err + } + + ms := time.Duration(int64(value)) * time.Millisecond + return ms, nil +} diff --git a/pkg/metrics/crossover_service.go b/pkg/metrics/crossover_service.go 
new file mode 100644 index 000000000..bde9f0a18 --- /dev/null +++ b/pkg/metrics/crossover_service.go @@ -0,0 +1,73 @@ +package metrics + +import ( + "time" +) + +var crossoverServiceQueries = map[string]string{ + "request-success-rate": ` + sum( + rate( + envoy_cluster_upstream_rq{ + kubernetes_namespace="{{ .Namespace }}", + envoy_cluster_name="{{ .Name }}-canary", + envoy_response_code!~"5.*" + }[{{ .Interval }}] + ) + ) + / + sum( + rate( + envoy_cluster_upstream_rq{ + kubernetes_namespace="{{ .Namespace }}", + envoy_cluster_name="{{ .Name }}-canary" + }[{{ .Interval }}] + ) + ) + * 100`, + "request-duration": ` + histogram_quantile( + 0.99, + sum( + rate( + envoy_cluster_upstream_rq_time_bucket{ + kubernetes_namespace="{{ .Namespace }}", + envoy_cluster_name="{{ .Name }}-canary" + }[{{ .Interval }}] + ) + ) by (le) + )`, +} + +type CrossoverServiceObserver struct { + client *PrometheusClient +} + +func (ob *CrossoverServiceObserver) GetRequestSuccessRate(name string, namespace string, interval string) (float64, error) { + query, err := ob.client.RenderQuery(name, namespace, interval, crossoverServiceQueries["request-success-rate"]) + if err != nil { + return 0, err + } + + value, err := ob.client.RunQuery(query) + if err != nil { + return 0, err + } + + return value, nil +} + +func (ob *CrossoverServiceObserver) GetRequestDuration(name string, namespace string, interval string) (time.Duration, error) { + query, err := ob.client.RenderQuery(name, namespace, interval, crossoverServiceQueries["request-duration"]) + if err != nil { + return 0, err + } + + value, err := ob.client.RunQuery(query) + if err != nil { + return 0, err + } + + ms := time.Duration(int64(value)) * time.Millisecond + return ms, nil +} diff --git a/pkg/metrics/crossover_service_test.go b/pkg/metrics/crossover_service_test.go new file mode 100644 index 000000000..8d65bbab0 --- /dev/null +++ b/pkg/metrics/crossover_service_test.go @@ -0,0 +1,74 @@ +package metrics + +import ( + "net/http" + 
"net/http/httptest" + "testing" + "time" +) + +func TestCrossoverServiceObserver_GetRequestSuccessRate(t *testing.T) { + expected := ` sum( rate( envoy_cluster_upstream_rq{ kubernetes_namespace="default", envoy_cluster_name="podinfo-canary", envoy_response_code!~"5.*" }[1m] ) ) / sum( rate( envoy_cluster_upstream_rq{ kubernetes_namespace="default", envoy_cluster_name="podinfo-canary" }[1m] ) ) * 100` + + ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + promql := r.URL.Query()["query"][0] + if promql != expected { + t.Errorf("\nGot %s \nWanted %s", promql, expected) + } + + json := `{"status":"success","data":{"resultType":"vector","result":[{"metric":{},"value":[1,"100"]}]}}` + w.Write([]byte(json)) + })) + defer ts.Close() + + client, err := NewPrometheusClient(ts.URL, time.Second) + if err != nil { + t.Fatal(err) + } + + observer := &CrossoverServiceObserver{ + client: client, + } + + val, err := observer.GetRequestSuccessRate("podinfo", "default", "1m") + if err != nil { + t.Fatal(err.Error()) + } + + if val != 100 { + t.Errorf("Got %v wanted %v", val, 100) + } +} + +func TestCrossoverServiceObserver_GetRequestDuration(t *testing.T) { + expected := ` histogram_quantile( 0.99, sum( rate( envoy_cluster_upstream_rq_time_bucket{ kubernetes_namespace="default", envoy_cluster_name="podinfo-canary" }[1m] ) ) by (le) )` + + ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + promql := r.URL.Query()["query"][0] + if promql != expected { + t.Errorf("\nGot %s \nWanted %s", promql, expected) + } + + json := `{"status":"success","data":{"resultType":"vector","result":[{"metric":{},"value":[1,"100"]}]}}` + w.Write([]byte(json)) + })) + defer ts.Close() + + client, err := NewPrometheusClient(ts.URL, time.Second) + if err != nil { + t.Fatal(err) + } + + observer := &CrossoverServiceObserver{ + client: client, + } + + val, err := observer.GetRequestDuration("podinfo", "default", "1m") + if err != 
nil { + t.Fatal(err.Error()) + } + + if val != 100*time.Millisecond { + t.Errorf("Got %v wanted %v", val, 100*time.Millisecond) + } +} diff --git a/pkg/metrics/crossover_test.go b/pkg/metrics/crossover_test.go new file mode 100644 index 000000000..dd788a6f5 --- /dev/null +++ b/pkg/metrics/crossover_test.go @@ -0,0 +1,74 @@ +package metrics + +import ( + "net/http" + "net/http/httptest" + "testing" + "time" +) + +func TestCrossoverObserver_GetRequestSuccessRate(t *testing.T) { + expected := ` sum( rate( envoy_cluster_upstream_rq{ kubernetes_namespace="default", envoy_cluster_name=~"podinfo-canary", envoy_response_code!~"5.*" }[1m] ) ) / sum( rate( envoy_cluster_upstream_rq{ kubernetes_namespace="default", envoy_cluster_name=~"podinfo-canary" }[1m] ) ) * 100` + + ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + promql := r.URL.Query()["query"][0] + if promql != expected { + t.Errorf("\nGot %s \nWanted %s", promql, expected) + } + + json := `{"status":"success","data":{"resultType":"vector","result":[{"metric":{},"value":[1,"100"]}]}}` + w.Write([]byte(json)) + })) + defer ts.Close() + + client, err := NewPrometheusClient(ts.URL, time.Second) + if err != nil { + t.Fatal(err) + } + + observer := &CrossoverObserver{ + client: client, + } + + val, err := observer.GetRequestSuccessRate("podinfo", "default", "1m") + if err != nil { + t.Fatal(err.Error()) + } + + if val != 100 { + t.Errorf("Got %v wanted %v", val, 100) + } +} + +func TestCrossoverObserver_GetRequestDuration(t *testing.T) { + expected := ` histogram_quantile( 0.99, sum( rate( envoy_cluster_upstream_rq_time_bucket{ kubernetes_namespace="default", envoy_cluster_name=~"podinfo-canary" }[1m] ) ) by (le) )` + + ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + promql := r.URL.Query()["query"][0] + if promql != expected { + t.Errorf("\nGot %s \nWanted %s", promql, expected) + } + + json := 
`{"status":"success","data":{"resultType":"vector","result":[{"metric":{},"value":[1,"100"]}]}}` + w.Write([]byte(json)) + })) + defer ts.Close() + + client, err := NewPrometheusClient(ts.URL, time.Second) + if err != nil { + t.Fatal(err) + } + + observer := &CrossoverObserver{ + client: client, + } + + val, err := observer.GetRequestDuration("podinfo", "default", "1m") + if err != nil { + t.Fatal(err.Error()) + } + + if val != 100*time.Millisecond { + t.Errorf("Got %v wanted %v", val, 100*time.Millisecond) + } +} diff --git a/pkg/metrics/factory.go b/pkg/metrics/factory.go index e2b69b8d8..be4f43149 100644 --- a/pkg/metrics/factory.go +++ b/pkg/metrics/factory.go @@ -6,19 +6,17 @@ import ( ) type Factory struct { - MeshProvider string - Client *PrometheusClient + Client *PrometheusClient } -func NewFactory(metricsServer string, meshProvider string, timeout time.Duration) (*Factory, error) { +func NewFactory(metricsServer string, timeout time.Duration) (*Factory, error) { client, err := NewPrometheusClient(metricsServer, timeout) if err != nil { return nil, err } return &Factory{ - MeshProvider: meshProvider, - Client: client, + Client: client, }, nil } @@ -33,7 +31,11 @@ func (factory Factory) Observer(provider string) Interface { client: factory.Client, } case provider == "appmesh": - return &EnvoyObserver{ + return &AppMeshObserver{ + client: factory.Client, + } + case provider == "crossover": + return &CrossoverObserver{ client: factory.Client, } case provider == "nginx": @@ -48,6 +50,10 @@ func (factory Factory) Observer(provider string) Interface { return &LinkerdObserver{ client: factory.Client, } + case provider == "crossover:service": + return &CrossoverServiceObserver{ + client: factory.Client, + } case provider == "linkerd": return &LinkerdObserver{ client: factory.Client,