
Commit

feat: set server PXEBooted condition only after Talos gets installed
The `PXEBooted` condition is now derived from the `TalosInstalled` condition of
the linked `ServerBinding`.
This makes the node keep PXE booting until Talos installation succeeds.

Signed-off-by: Artem Chernyshev <[email protected]>
Unix4ever committed Jan 17, 2022
1 parent 34f7822 commit b30fbe4
Showing 15 changed files with 411 additions and 24 deletions.
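At a glance: the install sequence events received over SideroLink now drive the `ServerBinding` conditions, and the server controller only marks a server `PXEBooted` once `TalosInstalled` is true. A condensed sketch of that flow, reusing identifiers from the patch below (illustration only; imports and the surrounding handler are omitted):

// Sketch of the new condition flow for an "install" sequence event.
switch {
case event.GetAction() == machine.SequenceEvent_START:
    // Installation started: reset the condition so the node keeps PXE booting.
    conditions.MarkFalse(serverBinding, sidero.TalosInstalledCondition, sidero.TalosInstallationInProgressReason, clusterv1.ConditionSeverityInfo, "")
case event.GetAction() == machine.SequenceEvent_STOP && event.GetError() != nil:
    // Installation failed: record the error; the node will be PXE booted again.
    conditions.MarkFalse(serverBinding, sidero.TalosInstalledCondition, sidero.TalosInstallationFailedReason, clusterv1.ConditionSeverityError, event.GetError().GetMessage())
case event.GetAction() == machine.SequenceEvent_STOP:
    // Installation succeeded: the server controller will then mark the server as PXEBooted.
    conditions.MarkTrue(serverBinding, sidero.TalosInstalledCondition)
}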
2 changes: 2 additions & 0 deletions Makefile
@@ -10,6 +10,7 @@ MODULE := $(shell head -1 go.mod | cut -d' ' -f2)
ARTIFACTS := _out
TEST_PKGS ?= ./...
TALOS_RELEASE ?= v0.14.0-alpha.2
PREVIOUS_TALOS_RELEASE ?= v0.13.4
DEFAULT_K8S_VERSION ?= v1.22.3

TOOLS ?= ghcr.io/talos-systems/tools:v0.9.0
@@ -171,6 +172,7 @@ run-sfyra: talos-artifacts clusterctl-release ## Run Sfyra integration test.
@ARTIFACTS=$(ARTIFACTS) \
CLUSTERCTL_CONFIG=$(SFYRA_CLUSTERCTL_CONFIG) \
TALOS_RELEASE=$(TALOS_RELEASE) \
PREVIOUS_TALOS_RELEASE=$(PREVIOUS_TALOS_RELEASE) \
./hack/scripts/integration-test.sh

# Development
@@ -41,6 +41,9 @@ const (
// TalosInstalledCondition reports when Talos OS was successfully installed on the node.
TalosInstalledCondition clusterv1.ConditionType = "TalosInstalled"

// TalosInstallationInProgressReason (Severity=Info) documents that Talos installation is in progress.
TalosInstallationInProgressReason = "TalosInstallationInProgress"

// TalosInstallationFailedReason (Severity=Error) documents that Talos installer has failed.
TalosInstallationFailedReason = "TalosInstallationFailed"
)
25 changes: 16 additions & 9 deletions app/sidero-controller-manager/cmd/events-manager/adapter.go
@@ -162,19 +162,26 @@ func (a *Adapter) HandleEvent(ctx context.Context, event events.Event) error {
}

func (a *Adapter) handleSequenceEvent(ctx context.Context, ip string, event *machine.SequenceEvent) error {
-    if event.GetSequence() == "install" &&
-        event.GetAction() == machine.SequenceEvent_STOP {
+    if event.GetSequence() == "install" {
        var callback func(*sidero.ServerBinding)

-        if event.GetError() != nil {
-            callback = func(serverbinding *sidero.ServerBinding) {
-                conditions.MarkFalse(serverbinding, sidero.TalosInstalledCondition, sidero.TalosInstallationFailedReason, clusterv1.ConditionSeverityError, event.GetError().GetMessage())
+        if event.GetAction() == machine.SequenceEvent_STOP {
+            if event.GetError() != nil {
+                callback = func(serverbinding *sidero.ServerBinding) {
+                    conditions.MarkFalse(serverbinding, sidero.TalosInstalledCondition, sidero.TalosInstallationFailedReason, clusterv1.ConditionSeverityError, event.GetError().GetMessage())
                }
+            } else {
+                callback = func(serverbinding *sidero.ServerBinding) {
+                    conditions.MarkTrue(serverbinding, sidero.TalosInstalledCondition)
+                    conditions.MarkTrue(serverbinding, sidero.TalosConfigValidatedCondition)
+                    conditions.MarkTrue(serverbinding, sidero.TalosConfigLoadedCondition)
+                }
+            }
-        } else {
+        } else if event.GetAction() == machine.SequenceEvent_START {
            callback = func(serverbinding *sidero.ServerBinding) {
-                conditions.MarkTrue(serverbinding, sidero.TalosInstalledCondition)
-                conditions.MarkTrue(serverbinding, sidero.TalosConfigValidatedCondition)
-                conditions.MarkTrue(serverbinding, sidero.TalosConfigLoadedCondition)
+                conditions.MarkFalse(serverbinding, sidero.TalosInstalledCondition, sidero.TalosInstallationInProgressReason, clusterv1.ConditionSeverityInfo, "")
+                conditions.MarkFalse(serverbinding, sidero.TalosConfigValidatedCondition, sidero.TalosInstallationInProgressReason, clusterv1.ConditionSeverityInfo, "")
+                conditions.MarkFalse(serverbinding, sidero.TalosConfigLoadedCondition, sidero.TalosInstallationInProgressReason, clusterv1.ConditionSeverityInfo, "")
            }
        }

26 changes: 17 additions & 9 deletions app/sidero-controller-manager/controllers/server_controller.go
@@ -114,7 +114,7 @@ func (r *ServerReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
        return result, nil
    }

-    allocated, serverBindingPresent, err := r.checkBinding(ctx, req)
+    allocated, serverBinding, err := r.getServerBinding(ctx, req)
    if err != nil {
        return ctrl.Result{}, err
    }
@@ -132,9 +132,14 @@ func (r *ServerReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
        s.Status.InUse = true
        s.Status.IsClean = false

-        if serverBindingPresent {
+        if serverBinding != nil {
            // clear any leftover ownerreferences, they were transferred by serverbinding controller
            s.OwnerReferences = []v1.OwnerReference{}
+
+            // Talos installation was successful, so mark the server as PXE booted.
+            if conditions.IsTrue(serverBinding, infrav1.TalosInstalledCondition) {
+                conditions.MarkTrue(serverBinding, metalv1alpha1.ConditionPXEBooted)
+            }
        }
    }

@@ -285,23 +290,26 @@ func (r *ServerReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
    return f(false, ctrl.Result{})
}

-func (r *ServerReconciler) checkBinding(ctx context.Context, req ctrl.Request) (allocated, serverBindingPresent bool, err error) {
-    var serverBinding infrav1.ServerBinding
+func (r *ServerReconciler) getServerBinding(ctx context.Context, req ctrl.Request) (bool, *infrav1.ServerBinding, error) {
+    var (
+        serverBinding infrav1.ServerBinding
+        err           error
+    )

    err = r.Get(ctx, req.NamespacedName, &serverBinding)
    if err == nil {
-        return true, true, nil
+        return true, &serverBinding, nil
    }

    if err != nil && !apierrors.IsNotFound(err) {
-        return false, false, err
+        return false, nil, err
    }

    // double-check metalmachines to make sure we don't have a missing serverbinding
    var metalMachineList infrav1.MetalMachineList

    if err := r.List(ctx, &metalMachineList, client.MatchingFields(fields.Set{infrav1.MetalMachineServerRefField: req.Name})); err != nil {
-        return false, false, err
+        return false, nil, err
    }

    for _, metalMachine := range metalMachineList.Items {
@@ -311,12 +319,12 @@ func (r *ServerReconciler) checkBinding(ctx context.Context, req ctrl.Request) (

        if metalMachine.Spec.ServerRef != nil {
            if metalMachine.Spec.ServerRef.Namespace == req.Namespace && metalMachine.Spec.ServerRef.Name == req.Name {
-                return true, false, nil
+                return true, nil, nil
            }
        }
    }

-    return false, false, nil
+    return false, nil, nil
}

func (r *ServerReconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, options controller.Options) error {
8 changes: 8 additions & 0 deletions app/sidero-controller-manager/internal/ipxe/ipxe_server.go
@@ -220,7 +220,15 @@ func ipxeHandler(w http.ResponseWriter, r *http.Request) {
return
}

// This code is left here only for backward compatibility with Talos <= v0.13.
if !strings.HasPrefix(env.ObjectMeta.Name, "agent") {
// Do not mark as PXE booted here if SideroLink events are available and Talos installation is in progress.
// SideroLink events handler will mark the machine with TalosInstalledCondition condition,
// then server controller will reconcile this status and mark server as PXEBooted.
if conditions.Has(serverBinding, infrav1.TalosInstalledCondition) {
return
}

if err = markAsPXEBooted(server); err != nil {
log.Printf("error marking server as PXE booted: %s", err)
}
6 changes: 6 additions & 0 deletions hack/release.toml
@@ -66,3 +66,9 @@ fails on the node.
- `TalosInstalled` is set to true/false when talos installer finishes.
"""

[notes.pxeboot]
title = "Retry PXE Boot"
description = """\
Sidero server controller now keeps track of Talos installation progress.
Now the node will be PXE booted until Talos installation succeeds.
"""
8 changes: 7 additions & 1 deletion hack/scripts/integration-test.sh
@@ -36,9 +36,15 @@ else
  PREFIX=
fi

+COMPATIBILITY_TESTS_ARGS=
+
+if [[ ! -z "${PREVIOUS_TALOS_RELEASE}" ]]; then
+  COMPATIBILITY_TESTS_ARGS="--prev-talos-release=${PREVIOUS_TALOS_RELEASE}"
+fi
+
${PREFIX} "${INTEGRATION_TEST}" test integration \
    --talosctl-path "${TALOSCTL}" \
    --clusterctl-config "${CLUSTERCTL_CONFIG}" \
    --power-simulated-explicit-failure-prob=0.1 \
    --power-simulated-silent-failure-prob=0.0 \
-    ${REGISTRY_MIRROR_FLAGS} ${SFYRA_EXTRA_FLAGS}
+    ${COMPATIBILITY_TESTS_ARGS} ${REGISTRY_MIRROR_FLAGS} ${SFYRA_EXTRA_FLAGS}
3 changes: 2 additions & 1 deletion sfyra/cmd/sfyra/cmd/options.go
@@ -42,7 +42,8 @@ type Options struct {

    DefaultBootOrder string

-    TalosctlPath string
+    TalosctlPath     string
+    PrevTalosRelease string

    PowerSimulatedExplicitFailureProb float64
    PowerSimulatedSilentFailureProb   float64
2 changes: 2 additions & 0 deletions sfyra/cmd/sfyra/cmd/test_integration.go
@@ -112,6 +112,7 @@ var testIntegrationCmd = &cobra.Command{
RunTestPattern: runTestPattern,

TalosRelease: TalosRelease,
PrevTalosRelease: options.PrevTalosRelease,
KubernetesVersion: KubernetesVersion,
}); !ok {
return fmt.Errorf("test failure")
@@ -144,4 +145,5 @@ func init() {
testIntegrationCmd.Flags().Float64Var(&options.PowerSimulatedExplicitFailureProb, "power-simulated-explicit-failure-prob", options.PowerSimulatedExplicitFailureProb, "simulated power management explicit failure probability")
testIntegrationCmd.Flags().Float64Var(&options.PowerSimulatedSilentFailureProb, "power-simulated-silent-failure-prob", options.PowerSimulatedSilentFailureProb, "simulated power management silent failure probability")
testIntegrationCmd.Flags().StringVar(&runTestPattern, "test.run", "", "tests to run (regular expression)")
testIntegrationCmd.Flags().StringVar(&options.PrevTalosRelease, "prev-talos-release", options.PrevTalosRelease, "Talos version to run compatibility tests against")
}
44 changes: 41 additions & 3 deletions sfyra/pkg/tests/cluster_utils.go
@@ -31,10 +31,28 @@ import (
    "github.com/talos-systems/sidero/sfyra/pkg/vm"
)

-func deployCluster(ctx context.Context, t *testing.T, metalClient client.Client, capiCluster talos.Cluster, vmSet *vm.Set,
-    capiManager *capi.Manager, clusterName, serverClassName string, loadbalancerPort int, controlPlaneNodes, workerNodes int64, talosVersion, kubernetesVersion string) (*loadbalancer.ControlPlane, *capi.Cluster) {
+type clusterOptions struct {
+    configURL string
+}
+
+type clusterOption func(o *clusterOptions)
+
+func withConfigURL(value string) clusterOption {
+    return func(o *clusterOptions) {
+        o.configURL = value
+    }
+}
+
+// createCluster without waiting for it to become ready.
+func createCluster(ctx context.Context, t *testing.T, metalClient client.Client, capiCluster talos.Cluster, vmSet *vm.Set,
+    capiManager *capi.Manager, clusterName, serverClassName string, loadbalancerPort int, controlPlaneNodes, workerNodes int64, talosVersion, kubernetesVersion string, options ...clusterOption) *loadbalancer.ControlPlane {
    t.Logf("deploying cluster %q from server class %q with loadbalancer port %d", clusterName, serverClassName, loadbalancerPort)

+    var opts clusterOptions
+    for _, o := range options {
+        o(&opts)
+    }
+
    kubeconfig, err := capiManager.GetKubeconfig(ctx)
    require.NoError(t, err)

@@ -60,6 +78,12 @@ func deployCluster(ctx context.Context, t *testing.T, metalClient client.Client,
        WorkerMachineCount: &workerNodes,
    }

+    if opts.configURL != "" {
+        templateOptions.URLSource = &capiclient.URLSourceOptions{
+            URL: opts.configURL,
+        }
+    }
+
    template, err := capiClient.GetClusterTemplate(templateOptions)
    require.NoError(t, err)

@@ -109,6 +133,11 @@ func deployCluster(ctx context.Context, t *testing.T, metalClient client.Client,
        require.NoError(t, err)
    }

+    return loadbalancer
+}
+
+// waitForClusterReady waits for cluster to become ready.
+func waitForClusterReady(ctx context.Context, t *testing.T, metalClient client.Client, vmSet *vm.Set, clusterName string) *capi.Cluster {
    t.Log("waiting for the cluster to be provisioned")

    require.NoError(t, retry.Constant(10*time.Minute, retry.WithUnits(10*time.Second), retry.WithErrorLogging(true)).Retry(func() error {
@@ -122,7 +151,16 @@ func deployCluster(ctx context.Context, t *testing.T, metalClient client.Client,

    require.NoError(t, deployedCluster.Health(ctx))

-    return loadbalancer, deployedCluster
+    return deployedCluster
}
+
+func deployCluster(ctx context.Context, t *testing.T, metalClient client.Client, capiCluster talos.Cluster, vmSet *vm.Set,
+    capiManager *capi.Manager, clusterName, serverClassName string, loadbalancerPort int, controlPlaneNodes, workerNodes int64, talosVersion, kubernetesVersion string, options ...clusterOption) *loadbalancer.ControlPlane {
+    loadbalancer := createCluster(ctx, t, metalClient, capiCluster, vmSet, capiManager, clusterName, serverClassName, loadbalancerPort, controlPlaneNodes, workerNodes, talosVersion, kubernetesVersion, options...)
+
+    waitForClusterReady(ctx, t, metalClient, vmSet, clusterName)
+
+    return loadbalancer
+}

func deleteCluster(ctx context.Context, t *testing.T, metalClient client.Client, clusterName string) {
113 changes: 113 additions & 0 deletions sfyra/pkg/tests/compatibility.go
@@ -0,0 +1,113 @@
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.

package tests

import (
"context"
"fmt"
"os"
"path/filepath"
"strings"
"testing"
"time"

v1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/types"
"sigs.k8s.io/controller-runtime/pkg/client"

"github.com/stretchr/testify/require"
"github.com/talos-systems/go-procfs/procfs"
"github.com/talos-systems/go-retry/retry"
"github.com/talos-systems/talos/pkg/machinery/kernel"

"github.com/talos-systems/sidero/app/sidero-controller-manager/api/v1alpha1"
"github.com/talos-systems/sidero/sfyra/pkg/capi"
"github.com/talos-systems/sidero/sfyra/pkg/constants"
"github.com/talos-systems/sidero/sfyra/pkg/talos"
"github.com/talos-systems/sidero/sfyra/pkg/vm"
)

const (
compatibilityClusterName = "compatibility-cluster"
compatibilityClusterLBPort = 10003
)

// TestCompatibilityCluster deploys the compatibility cluster via CAPI.
func TestCompatibilityCluster(ctx context.Context, metalClient client.Client, cluster talos.Cluster, vmSet *vm.Set, capiManager *capi.Manager, talosRelease, kubernetesVersion string) TestFunc {
return func(t *testing.T) {
if talosRelease == "" {
t.Skip("--prev-talos-release is not set, skipped compatibility check")
}

var environment v1alpha1.Environment

envName := fmt.Sprintf("talos-%s", strings.ReplaceAll(talosRelease, ".", "-"))

if err := metalClient.Get(ctx, types.NamespacedName{Name: envName}, &environment); err != nil {
if !apierrors.IsNotFound(err) {
require.NoError(t, err)
}

cmdline := procfs.NewCmdline("")
cmdline.SetAll(kernel.DefaultArgs)

cmdline.Append("console", "ttyS0")
cmdline.Append("talos.platform", "metal")

environment.APIVersion = constants.SideroAPIVersion
environment.Name = envName
environment.Spec.Kernel.URL = fmt.Sprintf("https://github.com/talos-systems/talos/releases/download/%s/vmlinuz-amd64", talosRelease)
environment.Spec.Kernel.SHA512 = ""
environment.Spec.Kernel.Args = cmdline.Strings()
environment.Spec.Initrd.URL = fmt.Sprintf("https://github.com/talos-systems/talos/releases/download/%s/initramfs-amd64.xz", talosRelease)
environment.Spec.Initrd.SHA512 = ""

require.NoError(t, metalClient.Create(ctx, &environment))
}

// wait for the environment to report ready
require.NoError(t, retry.Constant(5*time.Minute, retry.WithUnits(10*time.Second)).Retry(func() error {
if err := metalClient.Get(ctx, types.NamespacedName{Name: envName}, &environment); err != nil {
return err
}

if !isEnvironmentReady(&environment) {
return retry.ExpectedErrorf("some assets are not ready")
}

return nil
}))

serverClassName := envName
classSpec := v1alpha1.ServerClassSpec{
Qualifiers: v1alpha1.Qualifiers{
CPU: []v1alpha1.CPUInformation{
{
Manufacturer: "QEMU",
},
},
},
EnvironmentRef: &v1.ObjectReference{
Name: envName,
},
}

_, err := createServerClass(ctx, metalClient, serverClassName, classSpec)
require.NoError(t, err)

ex, err := os.Executable()
require.NoError(t, err)

exPath := filepath.Dir(ex)

loadbalancer := deployCluster(ctx, t, metalClient, cluster, vmSet, capiManager, compatibilityClusterName, serverClassName, compatibilityClusterLBPort, 1, 0, talosRelease, kubernetesVersion,
withConfigURL(fmt.Sprintf("file://%s/../templates/cluster-template-talos-%s.yaml", exPath, talosRelease)),
)

deleteCluster(ctx, t, metalClient, compatibilityClusterName)
loadbalancer.Close() //nolint:errcheck
}
}
