Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

support gated rollback #420

Merged
merged 5 commits into from
Feb 6, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -193,4 +193,4 @@ If you have any questions about Flagger and progressive delivery:
hands-on training and meetups in your area.
* File an [issue](https://github.com/weaveworks/flagger/issues/new).

Your feedback is always welcome!
Your feedback is always welcome!
1 change: 1 addition & 0 deletions artifacts/flagger/crd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -249,6 +249,7 @@ spec:
- confirm-promotion
- post-rollout
- event
- rollback
url:
description: URL address of this webhook
type: string
Expand Down
1 change: 1 addition & 0 deletions charts/flagger/templates/crd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,7 @@ spec:
- confirm-promotion
- post-rollout
- event
- rollback
url:
description: URL address of this webhook
type: string
Expand Down
20 changes: 20 additions & 0 deletions docs/gitbook/how-it-works.md
Original file line number Diff line number Diff line change
Expand Up @@ -551,6 +551,9 @@ The canary promotion is paused until the hooks return HTTP 200.
While the promotion is paused, Flagger will continue to run the metrics checks and rollout hooks.
* Post-rollout hooks are executed after the canary has been promoted or rolled back.
If a post rollout hook fails the error is logged.
* Rollback hooks are executed while a canary deployment is in either the Progressing or the Waiting status.
This provides the ability to roll back during analysis or while waiting for a confirmation. If a rollback hook
returns a successful HTTP status code, Flagger will roll back the canary deployment.
* Event hooks are executed every time Flagger emits a Kubernetes event. When configured,
every action that Flagger takes during a canary deployment will be sent as JSON via an HTTP POST request.

Expand Down Expand Up @@ -584,6 +587,9 @@ Spec:
timeout: 5s
metadata:
some: "message"
- name: "rollback gate"
type: rollback
url: http://flagger-loadtester.test/gate/halt
- name: "send to Slack"
type: event
url: http://event-recevier.notifications/slack
Expand Down Expand Up @@ -830,6 +836,10 @@ For manual approval of a canary deployment you can use the `confirm-rollout` and
The confirmation rollout hooks are executed before the pre-rollout hooks.
Flagger will halt the canary traffic shifting and analysis until the confirm webhook returns HTTP status 200.

For manual rollback of a canary deployment you can use the `rollback` webhook. The rollback hook will be called
during the analysis and confirmation states. If a rollback webhook returns a successful HTTP status code, Flagger
will shift all traffic back to the primary instance and fail the canary.

Manual gating with Flagger's tester:

```yaml
Expand Down Expand Up @@ -898,4 +908,14 @@ While the promotion is paused, Flagger will continue to run the metrics checks a
url: http://flagger-loadtester.test/gate/halt
```

The `rollback` hook type can be used to manually roll back the canary promotion.

```yaml
canaryAnalysis:
webhooks:
- name: "rollback"
type: rollback
url: http://flagger-loadtester.test/gate/halt
```

If you have notifications enabled, Flagger will post a message to Slack or MS Teams if a canary promotion is waiting for approval.
1 change: 1 addition & 0 deletions kustomize/base/flagger/crd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -249,6 +249,7 @@ spec:
- confirm-promotion
- post-rollout
- event
- rollback
url:
description: URL address of this webhook
type: string
Expand Down
2 changes: 2 additions & 0 deletions pkg/apis/flagger/v1alpha3/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,8 @@ const (
ConfirmPromotionHook HookType = "confirm-promotion"
// EventHook dispatches Flagger events to the specified endpoint
EventHook HookType = "event"
// RollbackHook rolls back the canary analysis if the webhook returns HTTP 200
RollbackHook HookType = "rollback"
)

// CanaryWebhook holds the reference to external checks used for canary analysis
Expand Down
104 changes: 66 additions & 38 deletions pkg/controller/scheduler.go
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,17 @@ func (c *Controller) advanceCanary(name string, namespace string, skipLivenessCh
return
}

// check if we should rollback
if cd.Status.Phase == flaggerv1.CanaryPhaseProgressing ||
cd.Status.Phase == flaggerv1.CanaryPhaseWaiting {
if ok := c.runRollbackHooks(cd, cd.Status.Phase); ok {
c.recordEventWarningf(cd, "Rolling back %s.%s manual webhook invoked", cd.Name, cd.Namespace)
c.sendNotification(cd, "Rolling back manual webhook invoked", false, true)
c.rollback(cd, canaryController, meshRouter)
return
}
}

// route all traffic to primary if analysis has succeeded
if cd.Status.Phase == flaggerv1.CanaryPhasePromoting {
if provider != "kubernetes" {
Expand Down Expand Up @@ -267,50 +278,13 @@ func (c *Controller) advanceCanary(name string, namespace string, skipLivenessCh
// check if the number of failed checks reached the threshold
if cd.Status.Phase == flaggerv1.CanaryPhaseProgressing &&
(!retriable || cd.Status.FailedChecks >= cd.Spec.CanaryAnalysis.Threshold) {

if cd.Status.FailedChecks >= cd.Spec.CanaryAnalysis.Threshold {
c.recordEventWarningf(cd, "Rolling back %s.%s failed checks threshold reached %v",
cd.Name, cd.Namespace, cd.Status.FailedChecks)
c.sendNotification(cd, fmt.Sprintf("Failed checks threshold reached %v", cd.Status.FailedChecks),
false, true)
}

if !retriable {
c.recordEventWarningf(cd, "Rolling back %s.%s progress deadline exceeded %v",
cd.Name, cd.Namespace, err)
c.sendNotification(cd, fmt.Sprintf("Progress deadline exceeded %v", err),
false, true)
}

// route all traffic back to primary
primaryWeight = 100
canaryWeight = 0
if err := meshRouter.SetRoutes(cd, primaryWeight, canaryWeight, false); err != nil {
c.recordEventWarningf(cd, "%v", err)
return
}

canaryPhaseFailed := cd.DeepCopy()
canaryPhaseFailed.Status.Phase = flaggerv1.CanaryPhaseFailed
c.recordEventWarningf(canaryPhaseFailed, "Canary failed! Scaling down %s.%s",
canaryPhaseFailed.Name, canaryPhaseFailed.Namespace)

c.recorder.SetWeight(cd, primaryWeight, canaryWeight)

// shutdown canary
if err := canaryController.Scale(cd, 0); err != nil {
c.recordEventWarningf(cd, "%v", err)
return
}

// mark canary as failed
if err := canaryController.SyncStatus(cd, flaggerv1.CanaryStatus{Phase: flaggerv1.CanaryPhaseFailed, CanaryWeight: 0}); err != nil {
c.logger.With("canary", fmt.Sprintf("%s.%s", cd.Name, cd.Namespace)).Errorf("%v", err)
return
}

c.recorder.SetStatus(cd, flaggerv1.CanaryPhaseFailed)
c.runPostRolloutHooks(cd, flaggerv1.CanaryPhaseFailed)
c.rollback(cd, canaryController, meshRouter)
return
}

Expand Down Expand Up @@ -757,6 +731,21 @@ func (c *Controller) runPostRolloutHooks(canary *flaggerv1.Canary, phase flagger
return true
}

// runRollbackHooks calls, in declaration order, every webhook of type
// RollbackHook listed in the canary analysis and reports whether one of them
// asked for a rollback.
//
// A call that returns no error is treated as the rollback signal: a warning
// event is recorded and true is returned immediately, so later hooks are not
// called. A call that returns an error only records an info event and the
// next hook is tried. The result is false when no rollback hook signalled.
func (c *Controller) runRollbackHooks(canary *flaggerv1.Canary, phase flaggerv1.CanaryPhase) bool {
	for _, hook := range canary.Spec.CanaryAnalysis.Webhooks {
		// Only rollback-type hooks take part in this check.
		if hook.Type != flaggerv1.RollbackHook {
			continue
		}

		// An error from the call means "do not roll back"; keep looking.
		if err := CallWebhook(canary.Name, canary.Namespace, phase, hook); err != nil {
			c.recordEventInfof(canary, "Rollback hook %s not signaling a rollback", hook.Name)
			continue
		}

		c.recordEventWarningf(canary, "Rollback check %s passed", hook.Name)
		return true
	}

	return false
}

func (c *Controller) runAnalysis(r *flaggerv1.Canary) bool {
// run external checks
for _, webhook := range r.Spec.CanaryAnalysis.Webhooks {
Expand Down Expand Up @@ -878,3 +867,42 @@ func (c *Controller) runAnalysis(r *flaggerv1.Canary) bool {

return true
}

// rollback fails the given canary and hands all traffic back to the primary.
//
// Steps, in order:
//  1. If the failed-checks counter has reached the analysis threshold, record
//     a warning event and send a notification saying so.
//  2. Set the routes to primary=100 / canary=0 through meshRouter.
//  3. Record a "Canary failed!" warning event and publish the new weights.
//  4. Scale the canary to zero through canaryController.
//  5. Sync the canary status to the Failed phase with weight 0.
//  6. Publish the Failed status and run the post-rollout hooks.
//
// An error in step 2, 4 or 5 is reported (as an event or a log entry) and
// stops the sequence; the remaining steps are not executed.
//
// NOTE(review): inside the body the parameter name canary shadows the
// imported canary package that the signature refers to.
func (c *Controller) rollback(canary *flaggerv1.Canary, canaryController canary.Controller, meshRouter router.Interface) {
	failedPhase := flaggerv1.CanaryPhaseFailed

	if canary.Status.FailedChecks >= canary.Spec.CanaryAnalysis.Threshold {
		c.recordEventWarningf(canary, "Rolling back %s.%s failed checks threshold reached %v",
			canary.Name, canary.Namespace, canary.Status.FailedChecks)
		notice := fmt.Sprintf("Failed checks threshold reached %v", canary.Status.FailedChecks)
		c.sendNotification(canary, notice, false, true)
	}

	// Send every request to the primary, none to the canary.
	toPrimary, toCanary := 100, 0
	err := meshRouter.SetRoutes(canary, toPrimary, toCanary, false)
	if err != nil {
		c.recordEventWarningf(canary, "%v", err)
		return
	}

	// The event is emitted against a copy that already carries the Failed
	// phase; the object passed in is left unchanged here.
	failedCopy := canary.DeepCopy()
	failedCopy.Status.Phase = failedPhase
	c.recordEventWarningf(failedCopy, "Canary failed! Scaling down %s.%s",
		failedCopy.Name, failedCopy.Namespace)

	c.recorder.SetWeight(canary, toPrimary, toCanary)

	// Shut the canary workload down.
	err = canaryController.Scale(canary, 0)
	if err != nil {
		c.recordEventWarningf(canary, "%v", err)
		return
	}

	// Persist the Failed phase on the canary status.
	failedStatus := flaggerv1.CanaryStatus{Phase: failedPhase, CanaryWeight: 0}
	err = canaryController.SyncStatus(canary, failedStatus)
	if err != nil {
		canaryID := fmt.Sprintf("%s.%s", canary.Name, canary.Namespace)
		c.logger.With("canary", canaryID).Errorf("%v", err)
		return
	}

	c.recorder.SetStatus(canary, failedPhase)
	c.runPostRolloutHooks(canary, failedPhase)
}