Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
102636: roachtest: report health status during teardown r=herkolategan,renatolabs a=smg260

roachtest: report health status of all nodes in teardown

In cockroachdb#102622, we changed how to determine if a node was suitable to connect to for post test validations, to first check a node's `health?ready=1` endpoint.  

This extends that PR by extracting the `http://node:port/health?ready=1` functionality, and using it to report on each node during teardown by logging the `status` and/or `body` or `error`. Finding a live node piggy backs of this information to create a connection to any node with `200` status.

This could be useful to add to `test_interface`

Epic: none
Release note: none


Co-authored-by: Miral Gadani <[email protected]>
  • Loading branch information
craig[bot] and Miral Gadani committed May 17, 2023
2 parents bba88ee + 67e1af7 commit b02f647
Show file tree
Hide file tree
Showing 2 changed files with 74 additions and 31 deletions.
74 changes: 45 additions & 29 deletions pkg/cmd/roachtest/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ import (
"io"
"io/fs"
"net"
"net/http"
"net/url"
"os"
"os/exec"
Expand Down Expand Up @@ -1369,45 +1368,62 @@ func (c *clusterImpl) assertNoDeadNode(ctx context.Context, t test.Test) error {
return nil
}

// ConnectToLiveNode returns a connection to a live node in the cluster. If no
// live node is found, it returns nil and -1. If a live node is found it returns
// a connection to it and the node's index.
func (c *clusterImpl) ConnectToLiveNode(ctx context.Context, t *testImpl) (*gosql.DB, int) {
if c.spec.NodeCount < 1 {
return nil, -1 // unit tests
}
type HealthStatusResult struct {
Node int
Status int
Body []byte
Err error
}

checkReady := func(ctx context.Context, node int) bool {
adminAddr, err := c.ExternalAdminUIAddr(ctx, t.L(), c.Node(node))
if err != nil {
t.L().Printf("n%d not ready/live: %s", node, err)
return false
}
func newHealthStatusResult(node int, status int, body []byte, err error) *HealthStatusResult {
return &HealthStatusResult{
Node: node,
Status: status,
Body: body,
Err: err,
}
}

url := fmt.Sprintf(`http://%s/health?ready=1`, adminAddr[0])
// HealthStatus returns the result of the /health?ready=1 endpoint for each node.
func (c *clusterImpl) HealthStatus(
ctx context.Context, l *logger.Logger, node option.NodeListOption,
) ([]*HealthStatusResult, error) {
if len(node) < 1 {
return nil, nil // unit tests
}
adminAddrs, err := c.ExternalAdminUIAddr(ctx, l, node)
if err != nil {
return nil, errors.WithDetail(err, "Unable to get admin UI address(es)")
}
getStatus := func(ctx context.Context, node int) *HealthStatusResult {
url := fmt.Sprintf(`http://%s/health?ready=1`, adminAddrs[node-1])
resp, err := httputil.Get(ctx, url)
if err != nil {
t.L().Printf("n%d not ready/live: %s", node, err)
return false
return newHealthStatusResult(node, 0, nil, err)
}

defer resp.Body.Close()
body, err := io.ReadAll(resp.Body)
if err != nil || resp.StatusCode != http.StatusOK {
t.L().Printf("n%d not ready/live: HTTP %d \n%s", node, resp.StatusCode, body)
return false
}

return true
return newHealthStatusResult(node, resp.StatusCode, body, err)
}

// Find a live node to run against, if one exists.
for i := 1; i <= c.spec.NodeCount; i++ {
if checkReady(ctx, i) {
return c.Conn(ctx, t.L(), i), i
results := make([]*HealthStatusResult, c.spec.NodeCount)

_ = contextutil.RunWithTimeout(ctx, "health status", 15*time.Second, func(ctx context.Context) error {
var wg sync.WaitGroup
wg.Add(c.spec.NodeCount)
for i := 1; i <= c.spec.NodeCount; i++ {
go func(node int) {
defer wg.Done()
results[node-1] = getStatus(ctx, node)
}(i)
}
}
return nil, -1
wg.Wait()
return nil
})

return results, nil
}

// FailOnInvalidDescriptors fails the test if there exists any descriptors in
Expand Down Expand Up @@ -2304,7 +2320,7 @@ func (c *clusterImpl) InternalAdminUIAddr(
return addrs, nil
}

// ExternalAdminUIAddr returns the internal Admin UI address in the form host:port
// ExternalAdminUIAddr returns the external Admin UI address in the form host:port
// for the specified node.
func (c *clusterImpl) ExternalAdminUIAddr(
ctx context.Context, l *logger.Logger, node option.NodeListOption,
Expand Down
31 changes: 29 additions & 2 deletions pkg/cmd/roachtest/test_runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ package main
import (
"archive/zip"
"context"
gosql "database/sql"
"fmt"
"html"
"io"
Expand Down Expand Up @@ -1111,6 +1112,33 @@ func (r *testRunner) teardownTest(
}
}

// We collect all the admin health endpoints in parallel,
// and select the first one that succeeds to run the validation queries
statuses, err := c.HealthStatus(ctx, t.L(), c.All())
if err != nil {
t.Error(errors.WithDetail(err, "Unable to check health status"))
}

var db *gosql.DB
var validationNode int
for _, s := range statuses {
if s.Err != nil {
t.L().Printf("n%d:/health?ready=1 error=%s", s.Node, s.Err)
continue
}

if s.Status != http.StatusOK {
t.L().Printf("n%d:/health?ready=1 status=%d body=%s", s.Node, s.Status, s.Body)
continue
}

if db == nil {
db = c.Conn(ctx, t.L(), s.Node)
validationNode = s.Node
}
t.L().Printf("n%d:/health?ready=1 status=200 ok", s.Node)
}

// We avoid trying to do this when t.Failed() (and in particular when there
// are dead nodes) because for reasons @tbg does not understand this gets
// stuck occasionally, which really ruins the roachtest run. The method
Expand All @@ -1119,10 +1147,9 @@ func (r *testRunner) teardownTest(
//
// TODO(testinfra): figure out why this can still get stuck despite the
// above.
db, node := c.ConnectToLiveNode(ctx, t)
if db != nil {
defer db.Close()
t.L().Printf("running validation checks on node %d (<10m)", node)
t.L().Printf("running validation checks on node %d (<10m)", validationNode)
// If this validation fails due to a timeout, it is very likely that
// the replica divergence check below will also fail.
if t.spec.SkipPostValidations&registry.PostValidationInvalidDescriptors == 0 {
Expand Down

0 comments on commit b02f647

Please sign in to comment.