diff --git a/mtest/cke-cluster.yml b/mtest/cke-cluster.yml index c7858207..a62ebcfe 100644 --- a/mtest/cke-cluster.yml +++ b/mtest/cke-cluster.yml @@ -28,6 +28,8 @@ repair: command_timeout_seconds: 30 need_drain: true watch_seconds: 30 + success_command: ["sh", "-c", "touch /tmp/mtest-repair-success-$1", "success"] + success_command_timeout_seconds: 30 health_check_command: ["sh", "-c", "test -f /tmp/mtest-repair-$1 && echo true", "health_check"] options: kube-api: diff --git a/mtest/repair_test.go b/mtest/repair_test.go index 668b6d70..07afd03d 100644 --- a/mtest/repair_test.go +++ b/mtest/repair_test.go @@ -62,6 +62,17 @@ func repairShouldNotProceed() { }).WithTimeout(time.Second * 60).Should(Succeed()) } +func repairSuccessCommandSuccess(node string) { + cmdSuccess := false + for _, host := range []string{host1, host2} { + _, _, err := execAt(host, "test", "-f", "/tmp/mtest-repair-success-"+node) + if err == nil { + cmdSuccess = true + } + } + Expect(cmdSuccess).To(BeTrue()) +} + func testRepairOperations() { // this will run: // - RepairDrainStartOp @@ -110,15 +121,34 @@ func testRepairOperations() { repairQueueAdd(node1) waitRepairSuccess(cluster) nodesShouldBeSchedulable(node1) + repairSuccessCommandSuccess(node1) ckecliSafe("repair-queue", "delete-finished") waitRepairEmpty(cluster) + By("setting erroneous success command") + originalSuccessCommand := cluster.Repair.RepairProcedures[0].RepairOperations[0].SuccessCommand + cluster.Repair.RepairProcedures[0].RepairOperations[0].SuccessCommand = []string{"false"} + _, err := ckecliClusterSet(cluster) + Expect(err).NotTo(HaveOccurred()) + time.Sleep(time.Second * 3) + + repairQueueAdd(node1) + waitRepairFailure(cluster) + + ckecliSafe("repair-queue", "delete-finished") + waitRepairEmpty(cluster) + + By("restoring success command") + cluster.Repair.RepairProcedures[0].RepairOperations[0].SuccessCommand = originalSuccessCommand + _, err = ckecliClusterSet(cluster) + Expect(err).NotTo(HaveOccurred()) + 
time.Sleep(time.Second * 3) + By("setting erroneous repair command") originalRepairCommand := cluster.Repair.RepairProcedures[0].RepairOperations[0].RepairSteps[0].RepairCommand - cluster.Repair.RepairProcedures[0].RepairOperations[0].RepairSteps[0].RepairCommand = []string{"false"} - _, err := ckecliClusterSet(cluster) + _, err = ckecliClusterSet(cluster) Expect(err).NotTo(HaveOccurred()) time.Sleep(time.Second * 3) diff --git a/op/repair_finish.go b/op/repair_finish.go index 9a7c0fe7..8aab9bca 100644 --- a/op/repair_finish.go +++ b/op/repair_finish.go @@ -76,24 +76,19 @@ func repairFinish(ctx context.Context, inf cke.Infrastructure, entry *cke.Repair return err } if op.SuccessCommand != nil { - err := func() error { - ctx := ctx - timeout := cke.DefaultRepairSuccessCommandTimeoutSeconds - if op.SuccessCommandTimeout != nil { - timeout = *op.SuccessCommandTimeout - } - if timeout != 0 { - var cancel context.CancelFunc - ctx, cancel = context.WithTimeout(ctx, time.Second*time.Duration(timeout)) - defer cancel() - } - args := append(op.SuccessCommand[1:], entry.Address) - command := well.CommandContext(ctx, op.SuccessCommand[0], args...) - return command.Run() - }() - if err != nil { - return err + ctx := ctx + timeout := cke.DefaultRepairSuccessCommandTimeoutSeconds + if op.SuccessCommandTimeout != nil { + timeout = *op.SuccessCommandTimeout } + if timeout != 0 { + var cancel context.CancelFunc + ctx, cancel = context.WithTimeout(ctx, time.Second*time.Duration(timeout)) + defer cancel() + } + args := append(op.SuccessCommand[1:], entry.Address) + command := well.CommandContext(ctx, op.SuccessCommand[0], args...) + return command.Run() } return nil }()