Skip to content

Commit

Permalink
test: implement scale up and down tests and fix found issues
Browse files Browse the repository at this point in the history
Fixes:
- Check etcd health on all nodes before scaling down.
- Check all nodes are booted before scaling down.
- Do not call `Shutdown` during machine deletion (only for Talos >= 0.12.2).

Scale down tests are disabled for Talos < 0.12.2.

Signed-off-by: Artem Chernyshev <[email protected]>
  • Loading branch information
Unix4ever committed Sep 16, 2021
1 parent 9435b12 commit 6ad6aac
Show file tree
Hide file tree
Showing 7 changed files with 338 additions and 65 deletions.
87 changes: 49 additions & 38 deletions controllers/configs.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,55 +50,66 @@ func (r *TalosControlPlaneReconciler) kubeconfigForCluster(ctx context.Context,
}

// talosconfigForMachines will generate a Talos client that uses *all* found addresses of the given machines as the endpoints.
func (r *TalosControlPlaneReconciler) talosconfigForMachine(ctx context.Context, clientset *kubernetes.Clientset, machine capiv1.Machine) (*talosclient.Client, error) {
if machine.Status.NodeRef == nil {
return nil, fmt.Errorf("%q machine does not have a nodeRef", machine.Name)
}

// grab all addresses as endpoints
node, err := clientset.CoreV1().Nodes().Get(ctx, machine.Status.NodeRef.Name, metav1.GetOptions{})
if err != nil {
return nil, err
func (r *TalosControlPlaneReconciler) talosconfigForMachines(ctx context.Context, clientset *kubernetes.Clientset, machines ...capiv1.Machine) (*talosclient.Client, error) {
if len(machines) == 0 {
return nil, fmt.Errorf("at least one machine should be provided")
}

addrList := []string{}
for _, addr := range node.Status.Addresses {
if addr.Type == corev1.NodeExternalIP || addr.Type == corev1.NodeInternalIP {
addrList = append(addrList, addr.Address)
}
}

if len(addrList) == 0 {
return nil, fmt.Errorf("no addresses were found for node %q", node.Name)
}
var t *talosconfig.Config

var (
cfgs cabptv1.TalosConfigList
found *cabptv1.TalosConfig
)
for _, machine := range machines {
if machine.Status.NodeRef == nil {
return nil, fmt.Errorf("%q machine does not have a nodeRef", machine.Name)
}

// find talosconfig in the machine's namespace
err = r.Client.List(ctx, &cfgs, client.InNamespace(machine.Namespace))
if err != nil {
return nil, err
}
// grab all addresses as endpoints
node, err := clientset.CoreV1().Nodes().Get(ctx, machine.Status.NodeRef.Name, metav1.GetOptions{})
if err != nil {
return nil, err
}

for _, cfg := range cfgs.Items {
for _, ref := range cfg.OwnerReferences {
if ref.Kind == "Machine" && ref.Name == machine.Name {
found = &cfg
break
for _, addr := range node.Status.Addresses {
if addr.Type == corev1.NodeExternalIP || addr.Type == corev1.NodeInternalIP {
addrList = append(addrList, addr.Address)
}
}
}

if found == nil {
return nil, fmt.Errorf("failed to find TalosConfig for %q", machine.Name)
}
if len(addrList) == 0 {
return nil, fmt.Errorf("no addresses were found for node %q", node.Name)
}

t, err := talosconfig.FromString(found.Status.TalosConfig)
if err != nil {
return nil, err
if t == nil {
var (
cfgs cabptv1.TalosConfigList
found *cabptv1.TalosConfig
)

// find talosconfig in the machine's namespace
err = r.Client.List(ctx, &cfgs, client.InNamespace(machine.Namespace))
if err != nil {
return nil, err
}

for _, cfg := range cfgs.Items {
for _, ref := range cfg.OwnerReferences {
if ref.Kind == "Machine" && ref.Name == machine.Name {
found = &cfg
break
}
}
}

if found == nil {
return nil, fmt.Errorf("failed to find TalosConfig for %q", machine.Name)
}

t, err = talosconfig.FromString(found.Status.TalosConfig)
if err != nil {
return nil, err
}
}
}

return talosclient.New(ctx, talosclient.WithEndpoints(addrList...), talosclient.WithConfig(t))
Expand Down
85 changes: 84 additions & 1 deletion controllers/etcd.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,92 @@ import (
"github.com/talos-systems/talos/pkg/machinery/api/machine"
talosclient "github.com/talos-systems/talos/pkg/machinery/client"
capiv1 "sigs.k8s.io/cluster-api/api/v1alpha3"
"sigs.k8s.io/cluster-api/util"
"sigs.k8s.io/controller-runtime/pkg/client"
)

// etcdHealthcheck verifies etcd health using the machines of the cluster that
// are not being deleted.
//
// Machines from ownedMachines with a non-zero deletion timestamp are skipped.
// A Talos client is built for the remaining machines and used to check that:
//   - for every returned "etcd" service entry, the last recorded event is in
//     the "Running" state and the service reports itself healthy;
//   - every etcd member list response contains exactly as many members as
//     there are non-deleted machines;
//   - every response after the first lists only member hostnames that were
//     already seen.
//
// It returns nil when all checks pass and the first encountered error otherwise.
func (r *TalosControlPlaneReconciler) etcdHealthcheck(ctx context.Context, cluster *capiv1.Cluster, ownedMachines []capiv1.Machine) error {
	clientset, err := r.kubeconfigForCluster(ctx, util.ObjectKey(cluster))
	if err != nil {
		return err
	}

	// Only machines that are not scheduled for deletion take part in the check.
	machines := make([]capiv1.Machine, 0, len(ownedMachines))

	for _, m := range ownedMachines {
		if m.ObjectMeta.DeletionTimestamp.IsZero() {
			machines = append(machines, m)
		}
	}

	c, err := r.talosconfigForMachines(ctx, clientset, machines...)
	if err != nil {
		return err
	}

	// Release the client connection once the checks are done; without this
	// every health check leaks a connection.
	defer c.Close() //nolint:errcheck

	service := "etcd"

	params := make([]interface{}, 0, len(machines)*2)
	for _, m := range machines {
		params = append(params, "node", m.Name)
	}

	r.Log.Info("Verifying etcd health on all nodes", params...)

	svcs, err := c.ServiceInfo(ctx, service)
	if err != nil {
		return err
	}

	// check that etcd service is healthy on all nodes
	for _, svc := range svcs {
		node := svc.Metadata.GetHostname()

		// Nil-safe getters: a response without an events message is reported
		// as "no events" instead of causing a nil pointer dereference.
		events := svc.Service.GetEvents().GetEvents()
		if len(events) == 0 {
			return fmt.Errorf("%s: no events recorded yet for service %q", node, service)
		}

		lastEvent := events[len(events)-1]
		if lastEvent.GetState() != "Running" {
			return fmt.Errorf("%s: service %q not in expected state %q: current state [%s] %s", node, service, "Running", lastEvent.GetState(), lastEvent.GetMsg())
		}

		if !svc.Service.GetHealth().GetHealthy() {
			return fmt.Errorf("%s: service is not healthy: %s", node, service)
		}
	}

	resp, err := c.EtcdMemberList(ctx, &machine.EtcdMemberListRequest{})
	if err != nil {
		return err
	}

	// Hostnames of etcd members seen so far across responses.
	members := map[string]struct{}{}
	expectedMembers := len(machines)

	for i, message := range resp.Messages {
		actualMembers := len(message.Members)

		node := message.Metadata.GetHostname()

		// check that the count of members is the same on all nodes
		if actualMembers != expectedMembers {
			return fmt.Errorf("%s: expected to have %d members, got %d", node, expectedMembers, actualMembers)
		}

		// check that member list is the same on all nodes
		for _, member := range message.Members {
			// The first response seeds the set; later responses must not
			// introduce hostnames that were not seen before.
			if _, found := members[member.Hostname]; i > 0 && !found {
				return fmt.Errorf("%s: found extra etcd member %s", node, member.Hostname)
			}

			members[member.Hostname] = struct{}{}
		}
	}

	return nil
}

// gracefulEtcdLeave removes a given machine from the etcd cluster by forfeiting leadership
// and issuing a "leave" request from the machine itself.
func (r *TalosControlPlaneReconciler) gracefulEtcdLeave(ctx context.Context, c *talosclient.Client, cluster client.ObjectKey, machineToLeave capiv1.Machine) error {
Expand Down Expand Up @@ -100,7 +183,7 @@ func (r *TalosControlPlaneReconciler) auditEtcd(ctx context.Context, cluster cli
return err
}

c, err := r.talosconfigForMachine(ctx, clientset, designatedCPMachine)
c, err := r.talosconfigForMachines(ctx, clientset, designatedCPMachine)
if err != nil {
return err
}
Expand Down
Loading

0 comments on commit 6ad6aac

Please sign in to comment.