36224: roachtest: after test, fail (or complain) if any CRDB nodes are dead r=andreimatei a=tbg

Do what's in the title. This will help us ensure that no test leaves dead
nodes behind (maybe some tests do that today, perhaps some chaos ones, but
it'll be easy enough to fix). This gives us confidence that when a test
passes, none of the nodes died. Additionally, for tests that fail, it will
always be obvious when a node died, because that information is now printed as
part of the test output, including what gets posted to a GitHub issue.

Example:

I changed the test below so that it would start nodes 1-3, immediately stop
node 1, and return, resulting in the following test failure:

```
--- FAIL: tpcc/nodes=3/w=max (7.24s)
cluster.go:956,context.go:90,cluster.go:942,test.go:1160,test.go:1219: dead
node detection: 4: skipped
	1: dead
	2: 83220
	3: 83229
	Error:  1: dead
```

Release note: None

36279: exec: minor refactor of test utilities to reduce confusion r=yuzefovich a=yuzefovich

Release note: None

Co-authored-by: Tobias Schottdorf <[email protected]>
Co-authored-by: Yahor Yuzefovich <[email protected]>
3 people committed Mar 28, 2019
3 parents 2780c2d + 875e7ff + b8746f5 commit 5f47746
Showing 10 changed files with 200 additions and 57 deletions.
96 changes: 79 additions & 17 deletions pkg/cmd/roachprod/install/cluster_synced.go
@@ -32,6 +32,7 @@ import (
"strings"
"sync"
"syscall"
"text/template"
"time"

"github.com/cockroachdb/cockroach/pkg/cmd/roachprod/config"
@@ -263,51 +264,97 @@ fi
}
}

// NodeMonitorInfo TODO(peter): document
// NodeMonitorInfo is a message describing a cockroach process' status.
type NodeMonitorInfo struct {
// The index of the node (in a SyncedCluster) at which the message originated.
Index int
Msg string
// A message about the node. This is either a PID, "dead", "nc exited", or
// "skipped".
// Anything but a PID or "skipped" is an indication that there is some
// problem with the node and that the process is not running.
Msg string
// Err is an error that may occur when trying to probe the status of the node.
// If Err is non-nil, Msg is empty. After an error is returned, the node with
// the given index will no longer be probed. Errors typically indicate networking
// issues or nodes that have (physically) shut down.
Err error
}

// Monitor TODO(peter): document
func (c *SyncedCluster) Monitor() chan NodeMonitorInfo {
// Monitor writes NodeMonitorInfo for the cluster nodes to the returned channel.
// Infos sent to the channel always have the Index and exactly one of Msg or Err
// set.
//
// If oneShot is true, infos are retrieved only once for each node and the
// channel is subsequently closed; otherwise the process continues indefinitely
// (emitting new information as the status of the cockroach process changes).
//
// If ignoreEmptyNodes is true, nodes on which no CockroachDB data is found
// (in {store-dir}) will not be probed and a single message, "skipped", will
// be emitted for them.
func (c *SyncedCluster) Monitor(ignoreEmptyNodes bool, oneShot bool) chan NodeMonitorInfo {
ch := make(chan NodeMonitorInfo)
nodes := c.ServerNodes()
var wg sync.WaitGroup

for i := range nodes {
wg.Add(2)
go func(i int) {
defer wg.Done()
sess, err := c.newSession(nodes[i])
if err != nil {
ch <- NodeMonitorInfo{nodes[i], err.Error()}
ch <- NodeMonitorInfo{Index: nodes[i], Err: err}
wg.Done()
return
}
defer sess.Close()

p, err := sess.StdoutPipe()
if err != nil {
ch <- NodeMonitorInfo{nodes[i], err.Error()}
ch <- NodeMonitorInfo{Index: nodes[i], Err: err}
wg.Done()
return
}

go func(p io.Reader) {
defer wg.Done()
r := bufio.NewReader(p)
for {
line, _, err := r.ReadLine()
if err == io.EOF {
return
}
ch <- NodeMonitorInfo{nodes[i], string(line)}
ch <- NodeMonitorInfo{Index: nodes[i], Msg: string(line)}
}
}(p)

// On each monitored node, we loop looking for a cockroach process. In
// order to avoid polling with lsof, if we find a live process we use nc
// (netcat) to connect to the rpc port which will block until the server
// either decides to kill the connection or the process is killed.
cmd := fmt.Sprintf(`
// In one-shot mode we don't use nc and we return after the first
// assessment of the process's health.
data := struct {
OneShot bool
IgnoreEmpty bool
Store string
Port int
}{
OneShot: oneShot,
IgnoreEmpty: ignoreEmptyNodes,
Store: Cockroach{}.NodeDir(c, nodes[i]),
Port: Cockroach{}.NodePort(c, nodes[i]),
}

snippet := `
lastpid=0
{{ if .IgnoreEmpty}}
if [ ! -f "{{.Store}}/CURRENT" ]; then
echo "skipped"
exit 0
fi
{{- end}}
while :; do
pid=$(lsof -i :%[1]d -sTCP:LISTEN | awk '!/COMMAND/ {print $2}')
pid=$(lsof -i :{{.Port}} -sTCP:LISTEN | awk '!/COMMAND/ {print $2}')
if [ "${pid}" != "${lastpid}" ]; then
if [ -n "${lastpid}" -a -z "${pid}" ]; then
echo dead
@@ -317,21 +364,29 @@ while :; do
echo ${pid}
fi
fi
{{if .OneShot }}
exit 0
{{- end}}
if [ -n "${lastpid}" ]; then
nc localhost %[1]d >/dev/null 2>&1
nc localhost {{.Port}} >/dev/null 2>&1
echo nc exited
else
sleep 1
fi
done
`,
Cockroach{}.NodePort(c, nodes[i]))
`

t := template.Must(template.New("script").Parse(snippet))
var buf bytes.Buffer
if err := t.Execute(&buf, data); err != nil {
ch <- NodeMonitorInfo{Index: nodes[i], Err: err}
return
}

// Request a PTY so that the script will receive a SIGPIPE
// when the session is closed.
if err := sess.RequestPty(); err != nil {
ch <- NodeMonitorInfo{nodes[i], err.Error()}
ch <- NodeMonitorInfo{Index: nodes[i], Err: err}
return
}
// Give the session a valid stdin pipe so that nc won't exit immediately.
@@ -340,16 +395,20 @@ done
// when the roachprod process is killed.
inPipe, err := sess.StdinPipe()
if err != nil {
ch <- NodeMonitorInfo{nodes[i], err.Error()}
ch <- NodeMonitorInfo{Index: nodes[i], Err: err}
return
}
defer inPipe.Close()
if err := sess.Run(cmd); err != nil {
ch <- NodeMonitorInfo{nodes[i], err.Error()}
if err := sess.Run(buf.String()); err != nil {
ch <- NodeMonitorInfo{Index: nodes[i], Err: err}
return
}
}(i)
}
go func() {
wg.Wait()
close(ch)
}()

return ch
}
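
For reference, a minimal, self-contained sketch of the templating approach used by Monitor above. The script is condensed (no polling loop and no nc), and the store path and port are made-up sample values standing in for the Cockroach{} helpers; running it shows how the {{ if }} branches expand for a given configuration.

```
package main

import (
	"bytes"
	"fmt"
	"text/template"
)

// Condensed form of the monitoring script: only the template mechanics are
// shown, not the full polling/nc logic from cluster_synced.go.
const snippet = `
{{ if .IgnoreEmpty}}
if [ ! -f "{{.Store}}/CURRENT" ]; then
  echo "skipped"
  exit 0
fi
{{- end}}
pid=$(lsof -i :{{.Port}} -sTCP:LISTEN | awk '!/COMMAND/ {print $2}')
echo "${pid:-dead}"
{{if .OneShot }}
exit 0
{{- end}}
`

func main() {
	// Sample values; in roachprod these come from Cockroach{}.NodeDir and
	// Cockroach{}.NodePort for the node being monitored.
	data := struct {
		OneShot     bool
		IgnoreEmpty bool
		Store       string
		Port        int
	}{OneShot: true, IgnoreEmpty: true, Store: "/mnt/data1/cockroach", Port: 26257}

	t := template.Must(template.New("script").Parse(snippet))
	var buf bytes.Buffer
	if err := t.Execute(&buf, data); err != nil {
		panic(err)
	}
	fmt.Print(buf.String()) // the rendered shell script that would be run over SSH
}
```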
@@ -684,6 +743,9 @@ const progressTodo = "----------------------------------------"

func formatProgress(p float64) string {
i := int(math.Ceil(float64(len(progressDone)) * (1 - p)))
if i > len(progressDone) {
i = len(progressDone)
}
return fmt.Sprintf("[%s%s] %.0f%%", progressDone[i:], progressTodo[:i], 100*p)
}
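
A quick standalone check of the clamp added above. The bars are assumed to be 40 characters (progressDone is not shown in this diff, so its contents are a guess); without the clamp, a p below zero makes i exceed the bar length and the slice expressions panic.

```
package main

import (
	"fmt"
	"math"
	"strings"
)

// Assumed 40-character bars, matching progressTodo in the surrounding file.
var (
	progressDone = strings.Repeat("+", 40)
	progressTodo = strings.Repeat("-", 40)
)

func formatProgress(p float64) string {
	i := int(math.Ceil(float64(len(progressDone)) * (1 - p)))
	if i > len(progressDone) {
		i = len(progressDone) // the new clamp keeps the slice indices in range
	}
	return fmt.Sprintf("[%s%s] %.0f%%", progressDone[i:], progressTodo[:i], 100*p)
}

func main() {
	fmt.Println(formatProgress(0.5))   // half-filled bar, 50%
	fmt.Println(formatProgress(-0.01)) // clamped; the slices would panic without the check
}
```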

34 changes: 31 additions & 3 deletions pkg/cmd/roachprod/main.go
@@ -98,6 +98,9 @@ var (
logsTo time.Time
logsInterval time.Duration

monitorIgnoreEmptyNodes bool
monitorOneShot bool

cachedHostsCluster string
)

@@ -965,10 +968,21 @@ of nodes, outputting a line whenever a change is detected:
if err != nil {
return err
}
for i := range c.Monitor() {
fmt.Printf("%d: %s\n", i.Index, i.Msg)
var lastErr error
for msg := range c.Monitor(monitorIgnoreEmptyNodes, monitorOneShot) {
if msg.Err != nil {
lastErr = errors.Wrapf(msg.Err, "%d", msg.Index)
fmt.Printf("%d: error: %s\n", msg.Index, msg.Err)
continue
}
if msg.Msg != "" {
fmt.Printf("%d: %s\n", msg.Index, msg.Msg)
}
if strings.Contains(msg.Msg, "dead") {
lastErr = errors.Wrapf(errors.New(msg.Msg), "%d", msg.Index)
}
}
return nil
return lastErr
}),
}

@@ -1554,6 +1568,20 @@ func main() {
logsCmd.Flags().StringVar(
&logsDir, "logs-dir", "logs", "path to the logs dir, if remote, relative to username's home dir, ignored if local")

monitorCmd.Flags().BoolVar(
&monitorIgnoreEmptyNodes,
"ignore-empty-nodes",
false,
"Automatically detect the (subset of the given) nodes which to monitor "+
"based on the presence of a nontrivial data directory.")

monitorCmd.Flags().BoolVar(
&monitorOneShot,
"oneshot",
false,
"Report the status of all targeted nodes once, then exit. The exit "+
"status is nonzero if (and only if) any node was found not running.")

cachedHostsCmd.Flags().StringVar(&cachedHostsCluster, "cluster", "", "print hosts matching cluster")

for _, cmd := range []*cobra.Command{
2 changes: 2 additions & 0 deletions pkg/cmd/roachtest/chaos.go
@@ -94,6 +94,8 @@ func (ch *Chaos) Runner(c *cluster, m *monitor) func(context.Context) error {

select {
case <-ch.Stopper:
// NB: the roachtest harness checks that at the end of the test,
// all nodes that have data also have a running process.
l.Printf("restarting %v (chaos is done)\n", target)
c.Start(ctx, c.t.(*test), target)
return nil
3 changes: 3 additions & 0 deletions pkg/cmd/roachtest/cli.go
@@ -116,4 +116,7 @@ func runCLINodeStatus(ctx context.Context, t *test, c *cluster) {
"true true",
"false false",
})

// Start node again to satisfy roachtest.
c.Start(ctx, t, c.Node(3))
}
26 changes: 26 additions & 0 deletions pkg/cmd/roachtest/cluster.go
@@ -930,6 +930,32 @@ func (c *cluster) FetchDebugZip(ctx context.Context) error {
})
}

// FailOnDeadNodes fails the test if nodes that have a populated data dir are
// found to be not running. It prints both to t.l and the test output.
func (c *cluster) FailOnDeadNodes(ctx context.Context, t *test) {
if c.nodes == 0 {
// Zero nodes can happen during unit tests and implies there is nothing to do.
return
}

// Don't hang forever.
_ = contextutil.RunWithTimeout(ctx, "detect dead nodes", time.Minute, func(ctx context.Context) error {
_, err := execCmdWithBuffer(
ctx, t.l, roachprod, "monitor", c.name, "--oneshot", "--ignore-empty-nodes",
)
// If there's an error, it means either that the monitor command failed
// completely, or that it found a dead node worth complaining about.
if err != nil {
if ctx.Err() != nil {
// Don't fail if we timed out.
return nil
}
t.Fatalf("dead node detection: %s", err)
}
return nil
})
}
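
For readers outside the roachtest codebase, a rough standalone approximation of what FailOnDeadNodes does: shell out to roachprod's one-shot monitor with a time budget and treat a non-timeout error as a dead node. It assumes roachprod is on the PATH and uses a placeholder cluster name; the real code goes through t.l, execCmdWithBuffer, and contextutil instead.

```
package main

import (
	"context"
	"fmt"
	"os/exec"
	"time"
)

// deadNodeCheck runs `roachprod monitor --oneshot --ignore-empty-nodes` with a
// one-minute budget and reports an error if the monitor found a dead node.
func deadNodeCheck(ctx context.Context, clusterName string) error {
	ctx, cancel := context.WithTimeout(ctx, time.Minute)
	defer cancel()

	out, err := exec.CommandContext(
		ctx, "roachprod", "monitor", clusterName, "--oneshot", "--ignore-empty-nodes",
	).CombinedOutput()
	if err != nil {
		if ctx.Err() != nil {
			// The probe itself timed out; don't flag that as a dead node.
			return nil
		}
		return fmt.Errorf("dead node detection: %v\n%s", err, out)
	}
	return nil
}

func main() {
	// "local" is a placeholder cluster name for this sketch.
	if err := deadNodeCheck(context.Background(), "local"); err != nil {
		fmt.Println(err)
	}
}
```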

// FetchDmesg grabs the dmesg logs if possible. This requires being able to run
// `sudo dmesg` on the remote nodes.
func (c *cluster) FetchDmesg(ctx context.Context) error {
5 changes: 5 additions & 0 deletions pkg/cmd/roachtest/gossip.go
@@ -430,6 +430,11 @@ SELECT count(replicas)
}

g.checkConnectedAndFunctional(ctx, t, c)

// Stop our special snowflake process, which won't be recognized by the test
// harness, and start it again the usual way.
c.Stop(ctx, c.Node(1))
c.Start(ctx, t, c.Node(1))
}

func runCheckLocalityIPAddress(ctx context.Context, t *test, c *cluster) {
6 changes: 6 additions & 0 deletions pkg/cmd/roachtest/rapid_restart.go
@@ -111,4 +111,10 @@ func runRapidRestart(ctx context.Context, t *test, c *cluster) {

t.l.Printf("%d OK\n", j)
}

// Clean up for the test harness. Usually we want to leave nodes running so
// that consistency checks can be run, but in this case there's not much
// there in the first place anyway.
c.Stop(ctx, nodes)
c.Wipe(ctx, nodes)
}