From a3813d4fd2bc5c914aaf66aa9d40fc9ff9011afa Mon Sep 17 00:00:00 2001 From: Justin Santa Barbara Date: Mon, 27 Apr 2020 23:52:52 -0400 Subject: [PATCH] e2e tests should wait for cluster readiness Particularly with etcd2, we're seeing some flakes when run in github actions (which I assume is generally less powerful than some of the other configurations). The most recent error was the node not having a leader, so (briefly) wait for that along with node health before starting the test. --- pkg/etcdclient/interfaces.go | 4 +++ pkg/etcdclient/v2.go | 11 +++++++ pkg/etcdclient/v3.go | 36 ++++++++++++++++------- test/integration/harness/cluster.go | 6 ++++ test/integration/harness/node.go | 19 ++++++++++++ test/integration/upgradedowngrade_test.go | 9 +++--- 6 files changed, 71 insertions(+), 14 deletions(-) diff --git a/pkg/etcdclient/interfaces.go b/pkg/etcdclient/interfaces.go index 09981a3b..e6b8cb16 100644 --- a/pkg/etcdclient/interfaces.go +++ b/pkg/etcdclient/interfaces.go @@ -27,6 +27,10 @@ type EtcdClient interface { // CopyTo traverses every key and writes it to dest CopyTo(ctx context.Context, dest NodeSink) (int, error) + // LeaderID returns the ID of the current leader, or "" if there is no leader + // NOTE: This is currently only used in end-to-end tests + LeaderID(ctx context.Context) (string, error) + ListMembers(ctx context.Context) ([]*EtcdProcessMember, error) AddMember(ctx context.Context, peerURLs []string) error RemoveMember(ctx context.Context, member *EtcdProcessMember) error diff --git a/pkg/etcdclient/v2.go b/pkg/etcdclient/v2.go index cbcd3964..06c5d15f 100644 --- a/pkg/etcdclient/v2.go +++ b/pkg/etcdclient/v2.go @@ -130,6 +130,17 @@ func (c *V2Client) ListMembers(ctx context.Context) ([]*EtcdProcessMember, error return members, nil } +func (c *V2Client) LeaderID(ctx context.Context) (string, error) { + leader, err := c.members.Leader(ctx) + if err != nil { + return "", err + } + if leader == nil { + return "", nil + } + return leader.ID, nil +} + func (c *V2Client) AddMember(ctx context.Context, peerURLs []string) error { if len(peerURLs) == 0 { return fmt.Errorf("AddMember with empty peerURLs") diff --git a/pkg/etcdclient/v3.go b/pkg/etcdclient/v3.go index 88134feb..b349acda 100644 --- a/pkg/etcdclient/v3.go +++ b/pkg/etcdclient/v3.go @@ -20,11 +20,12 @@ import ( ) type V3Client struct { - endpoints []string - client *etcd_client_v3.Client - kv etcd_client_v3.KV - cluster etcd_client_v3.Cluster - tlsConfig *tls.Config + endpoints []string + client *etcd_client_v3.Client + kv etcd_client_v3.KV + cluster etcd_client_v3.Cluster + maintenance etcd_client_v3.Maintenance + tlsConfig *tls.Config } var _ EtcdClient = &V3Client{} @@ -46,12 +47,14 @@ func NewV3Client(endpoints []string, tlsConfig *tls.Config) (EtcdClient, error) } kv := etcd_client_v3.NewKV(etcdClient) + maintenance := etcd_client_v3.NewMaintenance(etcdClient) return &V3Client{ - endpoints: endpoints, - client: etcdClient, - kv: kv, - cluster: etcd_client_v3.NewCluster(etcdClient), - tlsConfig: tlsConfig, + endpoints: endpoints, + client: etcdClient, + kv: kv, + maintenance: maintenance, + cluster: etcd_client_v3.NewCluster(etcdClient), + tlsConfig: tlsConfig, }, nil } @@ -204,6 +207,19 @@ func (c *V3Client) ListMembers(ctx context.Context) ([]*EtcdProcessMember, error return members, nil } +func (c *V3Client) LeaderID(ctx context.Context) (string, error) { + response, err := c.maintenance.Status(ctx, c.endpoints[0]) + if err != nil { + return "", err + } + leaderID := response.Leader + if leaderID == 0 { + return "", nil + } + + return strconv.FormatUint(leaderID, 10), nil +} + func (c *V3Client) AddMember(ctx context.Context, peerURLs []string) error { _, err := c.cluster.MemberAdd(ctx, peerURLs) return err diff --git a/test/integration/harness/cluster.go b/test/integration/harness/cluster.go index a99bf2c5..c0d74169 100644 --- a/test/integration/harness/cluster.go +++ b/test/integration/harness/cluster.go @@ -157,6 +157,12 @@ func (h *TestHarness) WaitForHealthy(nodes ...*TestHarnessNode) { } } +func (h *TestHarness) WaitForHasLeader(nodes ...*TestHarnessNode) { + for _, node := range nodes { + node.WaitForHasLeader(10 * time.Second) + } +} + func (h *TestHarness) WaitForVersion(timeout time.Duration, expectedVersion string, nodes ...*TestHarnessNode) { for _, n := range nodes { h.WaitFor(timeout, func() error { diff --git a/test/integration/harness/node.go b/test/integration/harness/node.go index 1ac12d39..f7184fa7 100644 --- a/test/integration/harness/node.go +++ b/test/integration/harness/node.go @@ -285,6 +285,25 @@ func (n *TestHarnessNode) WaitForHealthy(timeout time.Duration) { }) } +func (n *TestHarnessNode) WaitForHasLeader(timeout time.Duration) { + n.TestHarness.WaitFor(timeout, func() error { + client, err := n.NewClient() + if err != nil { + return fmt.Errorf("error building etcd client: %v", err) + } + defer client.Close() + + leaderID, err := client.LeaderID(context.Background()) + if err != nil { + return err + } + if leaderID == "" { + return fmt.Errorf("node did not have leader") + } + return nil + }) +} + type MockDNSProvider struct { } diff --git a/test/integration/upgradedowngrade_test.go b/test/integration/upgradedowngrade_test.go index 269b2c39..355e7d9b 100644 --- a/test/integration/upgradedowngrade_test.go +++ b/test/integration/upgradedowngrade_test.go @@ -44,17 +44,18 @@ func TestUpgradeDowngrade(t *testing.T) { { n1.WaitForListMembers(60 * time.Second) h.WaitForHealthy(n1, n2, n3) + h.WaitForHasLeader(n1, n2, n3) members1, err := n1.ListMembers(ctx) if err != nil { - t.Errorf("error doing etcd ListMembers: %v", err) + t.Errorf("error doing etcd ListMembers (before upgrade): %v", err) } else if len(members1) != 3 { - t.Errorf("members was not as expected: %v", members1) + t.Errorf("members was not as expected (before upgrade): %v", members1) } else { klog.Infof("got members from #1: %v", members1) } if err := n1.Put(ctx, testKey, "worldv2"); err != nil { - t.Fatalf("unable to set test key: %v", err) + t.Fatalf("unable to set test key before upgrade: %v", err) } n1.AssertVersion(t, fromVersion) @@ -98,7 +99,7 @@ func TestUpgradeDowngrade(t *testing.T) { } if err := n1.Put(ctx, testKey, "worldv3"); err != nil { - t.Fatalf("unable to set test key: %v", err) + t.Fatalf("unable to set test key after upgrade: %v", err) } n1.AssertVersion(t, toVersion)