From fe41335457a76f429177a24627044758773fe557 Mon Sep 17 00:00:00 2001 From: Artem Chernyshev Date: Tue, 28 Dec 2021 20:11:12 +0300 Subject: [PATCH] feat: introduce new conditions in the `metalmachine` - `TalosConfigLoaded` is set to false when the config load has failed. - `TalosConfigValidated` is set to false when the config validation fails on the node. - `TalosInstalled` is set to true/false when talos installer finishes. All conditions are set by the adapter on the `ServerBinding`, then copied to the `MetalMachine`. Signed-off-by: Artem Chernyshev --- .../api/v1alpha3/metalmachine_conditions.go | 26 +++++++++++ .../api/v1alpha3/serverbinding_types.go | 15 +++++++ .../api/v1alpha3/zz_generated.deepcopy.go | 9 +++- ...cture.cluster.x-k8s.io_serverbindings.yaml | 45 +++++++++++++++++++ .../controllers/metalmachine_controller.go | 5 +++ .../cmd/events-manager/adapter.go | 44 +++++++++++++++--- hack/release.toml | 12 +++++ 7 files changed, 148 insertions(+), 8 deletions(-) diff --git a/app/caps-controller-manager/api/v1alpha3/metalmachine_conditions.go b/app/caps-controller-manager/api/v1alpha3/metalmachine_conditions.go index 036bbabe5..bcc990d41 100644 --- a/app/caps-controller-manager/api/v1alpha3/metalmachine_conditions.go +++ b/app/caps-controller-manager/api/v1alpha3/metalmachine_conditions.go @@ -18,3 +18,29 @@ const ( // to set ProviderID labels on all nodes. ProviderUpdateFailedReason = "ProviderUpdateFailed" ) + +const ( + // TalosConfigValidatedCondition reports when talos has loaded and validated the config + // for the machine. + TalosConfigValidatedCondition clusterv1.ConditionType = "TalosConfigValidated" + + // TalosConfigValidationFailedReason (Severity=Error) documents that Talos config validation has failed. + TalosConfigValidationFailedReason = "TalosConfigValidationFailed" +) + +const ( + // TalosConfigLoadedCondition reports when talos has loaded the config + // for the machine. + TalosConfigLoadedCondition clusterv1.ConditionType = "TalosConfigLoaded" + + // TalosConfigLoadedationFailedReason (Severity=Error) documents that Talos config validation has failed. + TalosConfigLoadFailedReason = "TalosConfigLoadFailed" +) + +const ( + // TalosInstalledCondition reports when Talos OS was successfully installed on the node. + TalosInstalledCondition clusterv1.ConditionType = "TalosInstalled" + + // TalosInstallationFailedReason (Severity=Error) documents that Talos installer has failed. + TalosInstallationFailedReason = "TalosInstallationFailed" +) diff --git a/app/caps-controller-manager/api/v1alpha3/serverbinding_types.go b/app/caps-controller-manager/api/v1alpha3/serverbinding_types.go index fe12eaeea..88fa2cce8 100644 --- a/app/caps-controller-manager/api/v1alpha3/serverbinding_types.go +++ b/app/caps-controller-manager/api/v1alpha3/serverbinding_types.go @@ -7,6 +7,7 @@ package v1alpha3 import ( corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" ) // ServerBindingMetalMachineRefField is a reference to a field matching server binding to a metal machine. @@ -43,6 +44,10 @@ type ServerBindingState struct { // Ready is true when matching server is found. // +optional Ready bool `json:"ready"` + + // Conditions defines current state of the ServerBinding. + // +optional + Conditions clusterv1.Conditions `json:"conditions,omitempty"` } // +kubebuilder:object:root=true @@ -68,6 +73,16 @@ type ServerBinding struct { Status ServerBindingState `json:"status,omitempty"` } +// GetConditions returns the set of conditions for this object. +func (in *ServerBinding) GetConditions() clusterv1.Conditions { + return in.Status.Conditions +} + +// SetConditions sets the conditions on this object. +func (in *ServerBinding) SetConditions(conditions clusterv1.Conditions) { + in.Status.Conditions = conditions +} + // +kubebuilder:object:root=true // ServerBindingList contains a list of ServerBinding. diff --git a/app/caps-controller-manager/api/v1alpha3/zz_generated.deepcopy.go b/app/caps-controller-manager/api/v1alpha3/zz_generated.deepcopy.go index 0b7033b3d..c6930642c 100644 --- a/app/caps-controller-manager/api/v1alpha3/zz_generated.deepcopy.go +++ b/app/caps-controller-manager/api/v1alpha3/zz_generated.deepcopy.go @@ -344,7 +344,7 @@ func (in *ServerBinding) DeepCopyInto(out *ServerBinding) { out.TypeMeta = in.TypeMeta in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) in.Spec.DeepCopyInto(&out.Spec) - out.Status = in.Status + in.Status.DeepCopyInto(&out.Status) } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ServerBinding. @@ -427,6 +427,13 @@ func (in *ServerBindingSpec) DeepCopy() *ServerBindingSpec { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *ServerBindingState) DeepCopyInto(out *ServerBindingState) { *out = *in + if in.Conditions != nil { + in, out := &in.Conditions, &out.Conditions + *out = make(v1beta1.Conditions, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ServerBindingState. diff --git a/app/caps-controller-manager/config/crd/bases/infrastructure.cluster.x-k8s.io_serverbindings.yaml b/app/caps-controller-manager/config/crd/bases/infrastructure.cluster.x-k8s.io_serverbindings.yaml index 0778caea1..864cee9a3 100644 --- a/app/caps-controller-manager/config/crd/bases/infrastructure.cluster.x-k8s.io_serverbindings.yaml +++ b/app/caps-controller-manager/config/crd/bases/infrastructure.cluster.x-k8s.io_serverbindings.yaml @@ -214,6 +214,51 @@ spec: status: description: ServerBindingState defines the observed state of ServerBinding. properties: + conditions: + description: Conditions defines current state of the ServerBinding. + items: + description: Condition defines an observation of a Cluster API resource + operational state. + properties: + lastTransitionTime: + description: Last time the condition transitioned from one status + to another. This should be when the underlying condition changed. + If that is not known, then using the time when the API field + changed is acceptable. + format: date-time + type: string + message: + description: A human readable message indicating details about + the transition. This field may be empty. + type: string + reason: + description: The reason for the condition's last transition + in CamelCase. The specific API may choose whether or not this + field is considered a guaranteed API. This field may not be + empty. + type: string + severity: + description: Severity provides an explicit classification of + Reason code, so the users or machines can immediately understand + the current situation and act accordingly. The Severity field + MUST be set only when Status=False. + type: string + status: + description: Status of the condition, one of True, False, Unknown. + type: string + type: + description: Type of condition in CamelCase or in foo.example.com/CamelCase. + Many .condition.type values are consistent across resources + like Available, but because arbitrary conditions can be useful + (see .node.status.conditions), the ability to deconflict is + important. + type: string + required: + - lastTransitionTime + - status + - type + type: object + type: array ready: description: Ready is true when matching server is found. type: boolean diff --git a/app/caps-controller-manager/controllers/metalmachine_controller.go b/app/caps-controller-manager/controllers/metalmachine_controller.go index b1924ebd7..8e53e3703 100644 --- a/app/caps-controller-manager/controllers/metalmachine_controller.go +++ b/app/caps-controller-manager/controllers/metalmachine_controller.go @@ -187,6 +187,11 @@ func (r *MetalMachineReconciler) Reconcile(ctx context.Context, req ctrl.Request metalMachine.Status.Addresses = addresses metalMachine.Status.Ready = true + + // copy conditions from the server binding + for _, condition := range serverBinding.GetConditions() { + conditions.Set(metalMachine, &condition) + } } err = r.patchProviderID(ctx, cluster, metalMachine) diff --git a/app/sidero-controller-manager/cmd/events-manager/adapter.go b/app/sidero-controller-manager/cmd/events-manager/adapter.go index fc407e50e..46f3d3af9 100644 --- a/app/sidero-controller-manager/cmd/events-manager/adapter.go +++ b/app/sidero-controller-manager/cmd/events-manager/adapter.go @@ -19,6 +19,8 @@ import ( "k8s.io/client-go/dynamic/dynamicinformer" "k8s.io/client-go/rest" "k8s.io/client-go/tools/cache" + clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" + "sigs.k8s.io/cluster-api/util/conditions" "sigs.k8s.io/cluster-api/util/patch" runtimeclient "sigs.k8s.io/controller-runtime/pkg/client" @@ -93,15 +95,32 @@ func (a *Adapter) HandleEvent(ctx context.Context, event events.Event) error { case *machine.AddressEvent: fields = append(fields, zap.String("hostname", event.GetHostname()), zap.String("addresses", strings.Join(event.GetAddresses(), ","))) - if err = a.updateAddresses(ctx, ip, event); err != nil { + err = a.patchServerBinding(ctx, ip, func(serverbinding *sidero.ServerBinding) { + serverbinding.Spec.Addresses = event.Addresses + serverbinding.Spec.Hostname = event.Hostname + }) + + if err != nil { a.logger.Error("failed to update server address", zap.Error(err)) return err } case *machine.ConfigValidationErrorEvent: fields = append(fields, zap.Error(fmt.Errorf(event.GetError()))) + + if err = a.patchServerBinding(ctx, ip, func(serverbinding *sidero.ServerBinding) { + conditions.MarkFalse(serverbinding, sidero.TalosConfigValidatedCondition, sidero.TalosConfigValidationFailedReason, clusterv1.ConditionSeverityError, event.GetError()) + }); err != nil { + return err + } case *machine.ConfigLoadErrorEvent: fields = append(fields, zap.Error(fmt.Errorf(event.GetError()))) + + if err = a.patchServerBinding(ctx, ip, func(serverbinding *sidero.ServerBinding) { + conditions.MarkFalse(serverbinding, sidero.TalosConfigLoadedCondition, sidero.TalosConfigLoadFailedReason, clusterv1.ConditionSeverityError, event.GetError()) + }); err != nil { + return err + } case *machine.PhaseEvent: fields = append(fields, zap.String("phase", event.GetPhase()), zap.String("action", event.GetAction().String())) case *machine.TaskEvent: @@ -118,13 +137,25 @@ func (a *Adapter) HandleEvent(ctx context.Context, event events.Event) error { if event.GetSequence() == "install" && event.GetAction() == machine.SequenceEvent_STOP { + var callback func(*sidero.ServerBinding) + if event.GetError() != nil { message = "failed to install Talos" - - break + callback = func(serverbinding *sidero.ServerBinding) { + conditions.MarkFalse(serverbinding, sidero.TalosInstalledCondition, sidero.TalosInstallationFailedReason, clusterv1.ConditionSeverityError, event.GetError().GetMessage()) + } + } else { + message = "successfully installed Talos" + callback = func(serverbinding *sidero.ServerBinding) { + conditions.MarkTrue(serverbinding, sidero.TalosInstalledCondition) + conditions.MarkTrue(serverbinding, sidero.TalosConfigValidatedCondition) + conditions.MarkTrue(serverbinding, sidero.TalosConfigLoadedCondition) + } } - message = "successfully installed Talos" + if e := a.patchServerBinding(ctx, ip, callback); e != nil { + return e + } } } @@ -141,7 +172,7 @@ func (a *Adapter) HandleEvent(ctx context.Context, event events.Event) error { return nil } -func (a *Adapter) updateAddresses(ctx context.Context, ip string, event *machine.AddressEvent) error { +func (a *Adapter) patchServerBinding(ctx context.Context, ip string, callback func(serverbinding *sidero.ServerBinding)) error { a.nodesMu.Lock() defer a.nodesMu.Unlock() @@ -160,8 +191,7 @@ func (a *Adapter) updateAddresses(ctx context.Context, ip string, event *machine return err } - serverbinding.Spec.Addresses = event.Addresses - serverbinding.Spec.Hostname = event.Hostname + callback(&serverbinding) return patchHelper.Patch(ctx, &serverbinding) } diff --git a/hack/release.toml b/hack/release.toml index edee88d45..c80ff5528 100644 --- a/hack/release.toml +++ b/hack/release.toml @@ -54,3 +54,15 @@ Which is then propagated to CAPI `Machine` resources. Requires Talos >= v0.14. """ + + [notes.conditions] + title = "New `MetalMachines` Conditions" + description = """\ +New set of conditions is now available which can simplify cluster troubleshooting: + +- `TalosConfigLoaded` is set to false when the config load has failed. +- `TalosConfigValidated` is set to false when the config validation +fails on the node. +- `TalosInstalled` is set to true/false when talos installer finishes. +""" +