diff --git a/cmd/clusterctl/api/v1alpha3/annotations.go b/cmd/clusterctl/api/v1alpha3/annotations.go index eb818ee181fb..a9436dec3fc2 100644 --- a/cmd/clusterctl/api/v1alpha3/annotations.go +++ b/cmd/clusterctl/api/v1alpha3/annotations.go @@ -31,4 +31,14 @@ const ( // // It will help any validation webhook to take decision based on it. DeleteForMoveAnnotation = "clusterctl.cluster.x-k8s.io/delete-for-move" + + // BlockMoveAnnotation prevents the cluster move operation from starting if it is defined on at least one + // of the objects in scope. + // Provider controllers are expected to set the annotation on resources that cannot be instantaneously + // paused and remove the annotation when the resource has been actually paused. + // + // e.g. If this annotation is defined with any value on an InfraMachine resource to be moved when + // `clusterctl move` is invoked, then NO resources for ANY workload cluster will be created on the + // destination management cluster until the annotation is removed. + BlockMoveAnnotation = "clusterctl.cluster.x-k8s.io/block-move" ) diff --git a/cmd/clusterctl/client/cluster/mover.go b/cmd/clusterctl/client/cluster/mover.go index 433651d7d4bd..5b9ba98dc4cc 100644 --- a/cmd/clusterctl/client/cluster/mover.go +++ b/cmd/clusterctl/client/cluster/mover.go @@ -21,6 +21,7 @@ import ( "fmt" "os" "path/filepath" + "time" "github.com/pkg/errors" corev1 "k8s.io/api/core/v1" @@ -32,6 +33,7 @@ import ( kerrors "k8s.io/apimachinery/pkg/util/errors" "k8s.io/apimachinery/pkg/util/sets" "k8s.io/apimachinery/pkg/util/version" + "k8s.io/apimachinery/pkg/util/wait" "k8s.io/klog/v2" "sigs.k8s.io/controller-runtime/pkg/client" @@ -333,6 +335,19 @@ func (o *objectMover) move(graph *objectGraph, toProxy Proxy, mutators ...Resour return errors.Wrap(err, "error pausing ClusterClasses") } + log.Info("Waiting for all resources to be ready to move") + // exponential backoff configuration which returns durations for a total time of ~2m. + // Example: 0, 5s, 8s, 11s, 17s, 26s, 38s, 57s, 86s, 128s + waitForMoveUnblockedBackoff := wait.Backoff{ + Duration: 5 * time.Second, + Factor: 1.5, + Steps: 10, + Jitter: 0.1, + } + if err := waitReadyForMove(o.fromProxy, graph.getMoveNodes(), o.dryRun, waitForMoveUnblockedBackoff); err != nil { + return errors.Wrap(err, "error waiting for resources to be ready to move") + } + // Nb. DO NOT call ensureNamespaces at this point because: // - namespace will be ensured to exist before creating the resource. // - If it's done here, we might create a namespace that can end up unused on target cluster (due to mutators). @@ -595,6 +610,55 @@ func setClusterClassPause(proxy Proxy, clusterclasses []*node, pause bool, dryRu return nil } +func waitReadyForMove(proxy Proxy, nodes []*node, dryRun bool, backoff wait.Backoff) error { + if dryRun { + return nil + } + + log := logf.Log + + c, err := proxy.NewClient() + if err != nil { + return errors.Wrap(err, "error creating client") + } + + for _, n := range nodes { + obj := &metav1.PartialObjectMetadata{ + ObjectMeta: metav1.ObjectMeta{ + Name: n.identity.Name, + Namespace: n.identity.Namespace, + }, + TypeMeta: metav1.TypeMeta{ + APIVersion: n.identity.APIVersion, + Kind: n.identity.Kind, + }, + } + key := client.ObjectKeyFromObject(obj) + log := log.WithValues("apiVersion", obj.GroupVersionKind(), "resource", klog.KObj(obj)) + + blockLogged := false + if err := retryWithExponentialBackoff(backoff, func() error { + if err := c.Get(ctx, key, obj); err != nil { + return errors.Wrapf(err, "error getting %s/%s", obj.GroupVersionKind(), key) + } + + if _, exists := obj.GetAnnotations()[clusterctlv1.BlockMoveAnnotation]; exists { + if !blockLogged { + log.Info(fmt.Sprintf("Move blocked by %s annotation, waiting for it to be removed", clusterctlv1.BlockMoveAnnotation)) + blockLogged = true + } + return errors.Errorf("resource is not ready to move: %s/%s", obj.GroupVersionKind(), key) + } + log.V(5).Info("Resource is ready to move") + return nil + }); err != nil { + return err + } + } + + return nil +} + // patchCluster applies a patch to a node referring to a Cluster object. func patchCluster(proxy Proxy, n *node, patch client.Patch, mutators ...ResourceMutatorFunc) error { cFrom, err := proxy.NewClient() diff --git a/cmd/clusterctl/client/cluster/mover_test.go b/cmd/clusterctl/client/cluster/mover_test.go index a1995f4772e5..823ffb432990 100644 --- a/cmd/clusterctl/client/cluster/mover_test.go +++ b/cmd/clusterctl/client/cluster/mover_test.go @@ -17,6 +17,7 @@ limitations under the License. package cluster import ( + "context" "fmt" "os" "path/filepath" @@ -29,7 +30,9 @@ import ( apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/sets" + "k8s.io/apimachinery/pkg/util/wait" "k8s.io/utils/pointer" "sigs.k8s.io/controller-runtime/pkg/client" @@ -2288,3 +2291,69 @@ func Test_deleteSourceObject(t *testing.T) { }) } } + +func TestWaitReadyForMove(t *testing.T) { + tests := []struct { + name string + moveBlocked bool + wantErr bool + }{ + { + name: "moving blocked cluster should fail", + moveBlocked: true, + wantErr: true, + }, + { + name: "moving unblocked cluster should succeed", + moveBlocked: false, + wantErr: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + g := NewWithT(t) + + clusterName := "foo" + clusterNamespace := "ns1" + objs := test.NewFakeCluster(clusterNamespace, clusterName).Objs() + + // Create an objectGraph bound a source cluster with all the CRDs for the types involved in the test. + graph := getObjectGraphWithObjs(objs) + + if tt.moveBlocked { + c, err := graph.proxy.NewClient() + g.Expect(err).NotTo(HaveOccurred()) + + ctx := context.Background() + cluster := &clusterv1.Cluster{} + err = c.Get(ctx, types.NamespacedName{Namespace: clusterNamespace, Name: clusterName}, cluster) + g.Expect(err).NotTo(HaveOccurred()) + anns := cluster.GetAnnotations() + if anns == nil { + anns = make(map[string]string) + } + anns[clusterctlv1.BlockMoveAnnotation] = "anything" + cluster.SetAnnotations(anns) + + g.Expect(c.Update(ctx, cluster)).To(Succeed()) + } + + // Get all the types to be considered for discovery + g.Expect(getFakeDiscoveryTypes(graph)).To(Succeed()) + + // trigger discovery the content of the source cluster + g.Expect(graph.Discovery("")).To(Succeed()) + + backoff := wait.Backoff{ + Steps: 1, + } + err := waitReadyForMove(graph.proxy, graph.getMoveNodes(), false, backoff) + if tt.wantErr { + g.Expect(err).To(HaveOccurred()) + } else { + g.Expect(err).NotTo(HaveOccurred()) + } + }) + } +} diff --git a/docs/book/src/clusterctl/commands/move.md b/docs/book/src/clusterctl/commands/move.md index 5c8f1ff1abb2..f8dfff4aeab9 100644 --- a/docs/book/src/clusterctl/commands/move.md +++ b/docs/book/src/clusterctl/commands/move.md @@ -32,6 +32,8 @@ The discovery mechanism for determining the objects to be moved is in the [provi Before moving a `Cluster`, clusterctl sets the `Cluster.Spec.Paused` field to `true` stopping the controllers from reconciling the workload cluster _in the source management cluster_. +clusterctl will wait until the `clusterctl.cluster.x-k8s.io/block-move` annotation is not +present on any resource targeted by the move operation. The `Cluster` object created in the target management cluster instead will be actively reconciled as soon as the move process completes. diff --git a/docs/book/src/clusterctl/provider-contract.md b/docs/book/src/clusterctl/provider-contract.md index 2f45e2931e54..13b3b627866f 100644 --- a/docs/book/src/clusterctl/provider-contract.md +++ b/docs/book/src/clusterctl/provider-contract.md @@ -469,7 +469,9 @@ If moving some of excluded object is required, the provider authors should creat exact move sequence to be executed by the user. Additionally, provider authors should be aware that `clusterctl move` assumes all the provider's Controllers respect the -`Cluster.Spec.Paused` field introduced in the v1alpha3 Cluster API specification. +`Cluster.Spec.Paused` field introduced in the v1alpha3 Cluster API specification. If a provider needs to perform extra work in response to a +cluster being paused, `clusterctl move` can be blocked from creating any resources on the destination +management cluster by annotating any resource to be moved with `clusterctl.cluster.x-k8s.io/block-move`.