Skip to content

Commit

Permalink
feat: introduce new conditions in the metalmachine
Browse files Browse the repository at this point in the history
- `TalosConfigLoaded` is set to false when the config load has failed.
- `TalosConfigValidated` is set to false when the config validation
fails on the node.
- `TalosInstalled` is set to true/false when talos installer finishes.

All conditions are set by the adapter on the `ServerBinding`, then
copied to the `MetalMachine`.

Signed-off-by: Artem Chernyshev <[email protected]>
  • Loading branch information
Unix4ever committed Dec 29, 2021
1 parent 6454dee commit fe41335
Show file tree
Hide file tree
Showing 7 changed files with 148 additions and 8 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,29 @@ const (
// to set ProviderID labels on all nodes.
ProviderUpdateFailedReason = "ProviderUpdateFailed"
)

const (
// TalosConfigValidatedCondition reports whether Talos has loaded and validated the config
// for the machine. It is marked false with TalosConfigValidationFailedReason when
// a config validation error is reported for the node.
TalosConfigValidatedCondition clusterv1.ConditionType = "TalosConfigValidated"

// TalosConfigValidationFailedReason (Severity=Error) documents that Talos config validation has failed.
TalosConfigValidationFailedReason = "TalosConfigValidationFailed"
)

const (
// TalosConfigLoadedCondition reports whether Talos has loaded the config
// for the machine. It is marked false with TalosConfigLoadFailedReason when
// a config load error is reported for the node.
TalosConfigLoadedCondition clusterv1.ConditionType = "TalosConfigLoaded"

// TalosConfigLoadFailedReason (Severity=Error) documents that Talos config load has failed.
TalosConfigLoadFailedReason = "TalosConfigLoadFailed"
)

const (
// TalosInstalledCondition reports whether Talos OS was successfully installed on the node.
// It is marked true when the install sequence stops without an error and
// false with TalosInstallationFailedReason when it stops with one.
TalosInstalledCondition clusterv1.ConditionType = "TalosInstalled"

// TalosInstallationFailedReason (Severity=Error) documents that Talos installer has failed.
TalosInstallationFailedReason = "TalosInstallationFailed"
)
15 changes: 15 additions & 0 deletions app/caps-controller-manager/api/v1alpha3/serverbinding_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ package v1alpha3
import (
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
)

// ServerBindingMetalMachineRefField is a reference to a field matching server binding to a metal machine.
Expand Down Expand Up @@ -43,6 +44,10 @@ type ServerBindingState struct {
// Ready is true when matching server is found.
// +optional
Ready bool `json:"ready"`

// Conditions defines current state of the ServerBinding.
// +optional
Conditions clusterv1.Conditions `json:"conditions,omitempty"`
}

// +kubebuilder:object:root=true
Expand All @@ -68,6 +73,16 @@ type ServerBinding struct {
Status ServerBindingState `json:"status,omitempty"`
}

// GetConditions returns the set of conditions for this object.
//
// The value returned is the Status.Conditions field itself, not a copy.
// Paired with SetConditions, it lets a ServerBinding be passed to the
// cluster-api conditions helpers (conditions.MarkTrue / conditions.MarkFalse).
func (in *ServerBinding) GetConditions() clusterv1.Conditions {
return in.Status.Conditions
}

// SetConditions sets the conditions on this object.
//
// The given conditions replace whatever is currently stored in
// Status.Conditions; nothing is merged with the previous value.
func (in *ServerBinding) SetConditions(conditions clusterv1.Conditions) {
in.Status.Conditions = conditions
}

// +kubebuilder:object:root=true

// ServerBindingList contains a list of ServerBinding.
Expand Down

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,51 @@ spec:
status:
description: ServerBindingState defines the observed state of ServerBinding.
properties:
conditions:
description: Conditions defines current state of the ServerBinding.
items:
description: Condition defines an observation of a Cluster API resource
operational state.
properties:
lastTransitionTime:
description: Last time the condition transitioned from one status
to another. This should be when the underlying condition changed.
If that is not known, then using the time when the API field
changed is acceptable.
format: date-time
type: string
message:
description: A human readable message indicating details about
the transition. This field may be empty.
type: string
reason:
description: The reason for the condition's last transition
in CamelCase. The specific API may choose whether or not this
field is considered a guaranteed API. This field may not be
empty.
type: string
severity:
description: Severity provides an explicit classification of
Reason code, so the users or machines can immediately understand
the current situation and act accordingly. The Severity field
MUST be set only when Status=False.
type: string
status:
description: Status of the condition, one of True, False, Unknown.
type: string
type:
description: Type of condition in CamelCase or in foo.example.com/CamelCase.
Many .condition.type values are consistent across resources
like Available, but because arbitrary conditions can be useful
(see .node.status.conditions), the ability to deconflict is
important.
type: string
required:
- lastTransitionTime
- status
- type
type: object
type: array
ready:
description: Ready is true when matching server is found.
type: boolean
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,11 @@ func (r *MetalMachineReconciler) Reconcile(ctx context.Context, req ctrl.Request

metalMachine.Status.Addresses = addresses
metalMachine.Status.Ready = true

// copy conditions from the server binding
for _, condition := range serverBinding.GetConditions() {
conditions.Set(metalMachine, &condition)
}
}

err = r.patchProviderID(ctx, cluster, metalMachine)
Expand Down
44 changes: 37 additions & 7 deletions app/sidero-controller-manager/cmd/events-manager/adapter.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ import (
"k8s.io/client-go/dynamic/dynamicinformer"
"k8s.io/client-go/rest"
"k8s.io/client-go/tools/cache"
clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
"sigs.k8s.io/cluster-api/util/conditions"
"sigs.k8s.io/cluster-api/util/patch"
runtimeclient "sigs.k8s.io/controller-runtime/pkg/client"

Expand Down Expand Up @@ -93,15 +95,32 @@ func (a *Adapter) HandleEvent(ctx context.Context, event events.Event) error {
case *machine.AddressEvent:
fields = append(fields, zap.String("hostname", event.GetHostname()), zap.String("addresses", strings.Join(event.GetAddresses(), ",")))

if err = a.updateAddresses(ctx, ip, event); err != nil {
err = a.patchServerBinding(ctx, ip, func(serverbinding *sidero.ServerBinding) {
serverbinding.Spec.Addresses = event.Addresses
serverbinding.Spec.Hostname = event.Hostname
})

if err != nil {
a.logger.Error("failed to update server address", zap.Error(err))

return err
}
case *machine.ConfigValidationErrorEvent:
fields = append(fields, zap.Error(fmt.Errorf(event.GetError())))

if err = a.patchServerBinding(ctx, ip, func(serverbinding *sidero.ServerBinding) {
conditions.MarkFalse(serverbinding, sidero.TalosConfigValidatedCondition, sidero.TalosConfigValidationFailedReason, clusterv1.ConditionSeverityError, event.GetError())
}); err != nil {
return err
}
case *machine.ConfigLoadErrorEvent:
fields = append(fields, zap.Error(fmt.Errorf(event.GetError())))

if err = a.patchServerBinding(ctx, ip, func(serverbinding *sidero.ServerBinding) {
conditions.MarkFalse(serverbinding, sidero.TalosConfigLoadedCondition, sidero.TalosConfigLoadFailedReason, clusterv1.ConditionSeverityError, event.GetError())
}); err != nil {
return err
}
case *machine.PhaseEvent:
fields = append(fields, zap.String("phase", event.GetPhase()), zap.String("action", event.GetAction().String()))
case *machine.TaskEvent:
Expand All @@ -118,13 +137,25 @@ func (a *Adapter) HandleEvent(ctx context.Context, event events.Event) error {

if event.GetSequence() == "install" &&
event.GetAction() == machine.SequenceEvent_STOP {
var callback func(*sidero.ServerBinding)

if event.GetError() != nil {
message = "failed to install Talos"

break
callback = func(serverbinding *sidero.ServerBinding) {
conditions.MarkFalse(serverbinding, sidero.TalosInstalledCondition, sidero.TalosInstallationFailedReason, clusterv1.ConditionSeverityError, event.GetError().GetMessage())
}
} else {
message = "successfully installed Talos"
callback = func(serverbinding *sidero.ServerBinding) {
conditions.MarkTrue(serverbinding, sidero.TalosInstalledCondition)
conditions.MarkTrue(serverbinding, sidero.TalosConfigValidatedCondition)
conditions.MarkTrue(serverbinding, sidero.TalosConfigLoadedCondition)
}
}

message = "successfully installed Talos"
if e := a.patchServerBinding(ctx, ip, callback); e != nil {
return e
}
}
}

Expand All @@ -141,7 +172,7 @@ func (a *Adapter) HandleEvent(ctx context.Context, event events.Event) error {
return nil
}

func (a *Adapter) updateAddresses(ctx context.Context, ip string, event *machine.AddressEvent) error {
func (a *Adapter) patchServerBinding(ctx context.Context, ip string, callback func(serverbinding *sidero.ServerBinding)) error {
a.nodesMu.Lock()
defer a.nodesMu.Unlock()

Expand All @@ -160,8 +191,7 @@ func (a *Adapter) updateAddresses(ctx context.Context, ip string, event *machine
return err
}

serverbinding.Spec.Addresses = event.Addresses
serverbinding.Spec.Hostname = event.Hostname
callback(&serverbinding)

return patchHelper.Patch(ctx, &serverbinding)
}
Expand Down
12 changes: 12 additions & 0 deletions hack/release.toml
Original file line number Diff line number Diff line change
Expand Up @@ -54,3 +54,15 @@ Which is then propagated to CAPI `Machine` resources.
Requires Talos >= v0.14.
"""

[notes.conditions]
title = "New `MetalMachines` Conditions"
description = """\
A new set of conditions is now available to simplify cluster troubleshooting:
- `TalosConfigLoaded` is set to false when the config load has failed.
- `TalosConfigValidated` is set to false when the config validation
fails on the node.
- `TalosInstalled` is set to true/false when talos installer finishes.
"""

0 comments on commit fe41335

Please sign in to comment.