
Commit

51870: kvserver: improve node liveness failure log, point to docs r=lunevalex a=nvanbenschoten

This commit improves the message logged on node liveness heartbeat
failures, which is emitted roughly every 4.5 seconds while liveness is
unavailable. The improved message describes some of the common causes
of liveness unavailability (resource starvation and network
connectivity problems) and links to our troubleshooting docs on the topic.

This was an action item coming out of a recent support incident postmortem.
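
The new log text itself is added in the node_liveness.go hunk below. As a
rough illustration of the pattern, here is a minimal standalone Go sketch
that substitutes the standard library's log package for CockroachDB's
internal logger (the error value is a made-up example):

package main

import (
    "errors"
    "log"
)

// Mirrors the constant added to pkg/kv/kvserver/node_liveness.go below.
const heartbeatFailureLogFormat = `failed node liveness heartbeat: %+v
An inability to maintain liveness will prevent a node from participating in a
cluster. If this problem persists, it may be a sign of resource starvation or
of network connectivity problems. For help troubleshooting, visit:
https://www.cockroachlabs.com/docs/stable/cluster-setup-troubleshooting.html#node-liveness-issues
`

func main() {
    // Stand-in for whatever error the liveness heartbeat returned.
    err := errors.New("context deadline exceeded")
    // Emitted roughly every 4.5 seconds while liveness remains unavailable.
    log.Printf(heartbeatFailureLogFormat, err)
}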

51907: roachtest: deflake decommission/nodes=4 test r=irfansharif a=irfansharif

Fixes #51713. Previously this test decommissioned+wiped node 1, which
played badly with #51329, where node 1 is tasked with initializing the
cluster (and no-ops the attempt when it finds a persisted file). Given
that this test wiped away node 1's store dir in its entirety, the
persisted file was lost, so the test attempted to re-initialize the
cluster and naturally failed to do so. As a short-term workaround, we
now decommission+wipe nodes towards the end of the cluster list (so
node 4) instead of the beginning. While here, clean up the test code a bit.

Release note: None.
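
As a quick illustration of the new target selection, here is a small
standalone Go sketch of the arithmetic used in the decommission.go hunks
below; the constants mirror the test's 4-node, replication-factor-3 setup,
and the loop bound is only for demonstration:

package main

import "fmt"

func main() {
    const (
        nodes                    = 4 // cluster size used by the test
        defaultReplicationFactor = 3
    )
    // Only ever touch a fixed minority of nodes so quorum is never at risk.
    numDecom := (defaultReplicationFactor - 1) / 2 // == 1

    // Cycle through the last numDecom nodes; node 1 stays pinned and is
    // never decommissioned or wiped.
    node := nodes
    for i := 0; i < 3; i++ {
        node = nodes - (node % numDecom)
        fmt.Println("decommission+wipe target: node", node) // always node 4 here
    }
}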

51945: roachprod: lose sudo in roachprod stage r=ajwerner a=ajwerner

We shouldn't need it.

Release note: None
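
For context, here is a minimal Go sketch of how the optional-library
download command is assembled after this change, mirroring the staging.go
hunk below (the library name, extension, and URL are illustrative
placeholders):

package main

import "fmt"

func main() {
    libraryName, ext := "libgeos", ".so"
    url := "https://example.com/libgeos.so" // placeholder; roachprod resolves the real URL
    cmdStr := fmt.Sprintf(
        `mkdir -p ./lib && \
curl -sfSL -o "./lib/%s" "%s" 2>/dev/null || echo 'optional library %s not found; continuing...'`,
        libraryName+ext,
        url,
        libraryName+ext,
    )
    // Everything lands beneath the roachprod user's home directory, so no
    // elevated privileges (sudo) are needed.
    fmt.Println(cmdStr)
}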

Co-authored-by: Nathan VanBenschoten <[email protected]>
Co-authored-by: irfan sharif <[email protected]>
Co-authored-by: Andrew Werner <[email protected]>
4 people committed Jul 28, 2020
4 parents 3a1b423 + 09076fd + 30aaf6d + a2aa5bb commit 4c24889
Showing 3 changed files with 84 additions and 60 deletions.
6 changes: 3 additions & 3 deletions pkg/cmd/roachprod/install/staging.go
@@ -87,7 +87,7 @@ func StageOptionalRemoteLibrary(
fmt.Printf("Resolved library url for %s: %s\n", libraryName, url)
cmdStr := fmt.Sprintf(
`mkdir -p ./lib && \
sudo curl -sfSL -o "./lib/%s" "%s" 2>/dev/null || echo 'optional library %s not found; continuing...'`,
curl -sfSL -o "./lib/%s" "%s" 2>/dev/null || echo 'optional library %s not found; continuing...'`,
libraryName+ext,
url,
libraryName+ext,
@@ -121,8 +121,8 @@ func StageCockroachRelease(c *SyncedCluster, version, arch string) error {
tmpdir="$(mktemp -d /tmp/cockroach-release.XXX)" && \
curl -f -s -S -o- %s | tar xfz - -C "${tmpdir}" --strip-components 1 && \
mv ${tmpdir}/cockroach ./cockroach && \
sudo mkdir -p ./lib && \
if [ -d ${tmpdir}/lib ]; then sudo mv ${tmpdir}/lib/* ./lib; fi && \
mkdir -p ./lib && \
if [ -d ${tmpdir}/lib ]; then mv ${tmpdir}/lib/* ./lib; fi && \
chmod 755 ./cockroach
`, binURL)
return c.Run(
126 changes: 70 additions & 56 deletions pkg/cmd/roachtest/decommission.go
@@ -36,15 +36,16 @@ func registerDecommission(r *testRegistry) {
duration := time.Hour

r.Add(testSpec{
Name: fmt.Sprintf("decommission/nodes=%d/duration=%s", numNodes, duration),
Owner: OwnerKV,
Cluster: makeClusterSpec(4),
Name: fmt.Sprintf("decommission/nodes=%d/duration=%s", numNodes, duration),
Owner: OwnerKV,
MinVersion: "v20.2.0",
Cluster: makeClusterSpec(4),
Run: func(ctx context.Context, t *test, c *cluster) {
if local {
duration = 3 * time.Minute
duration = 5 * time.Minute
t.l.Printf("running with duration=%s in local mode\n", duration)
}
runDecommission(t, c, numNodes, duration)
runDecommission(ctx, t, c, numNodes, duration)
},
})
}
@@ -63,14 +64,20 @@ func registerDecommission(r *testRegistry) {
}
}

// runDecommission decommissions and wipes nodes in a cluster repeatedly,
// alternating between the node being shut down gracefully before and after the
// decommissioning operation, while some light load is running against the
// cluster (to manually verify that the qps don't dip too much).
//
// TODO(tschottdorf): verify that the logs don't contain the messages
// that would spam the log before #23605. I wonder if we should really
// start grepping the logs. An alternative is to introduce a metric
// that would have signaled this and check that instead.
func runDecommission(t *test, c *cluster, nodes int, duration time.Duration) {
ctx := context.Background()

func runDecommission(ctx context.Context, t *test, c *cluster, nodes int, duration time.Duration) {
const defaultReplicationFactor = 3
if defaultReplicationFactor > nodes {
t.Fatal("improper configuration: replication factor greater than number of nodes in the test")
}
// The number of nodes we're going to cycle through. Since we're sometimes
// killing the nodes and then removing them, this means having to be careful
// with loss of quorum. So only ever touch a fixed minority of nodes and
@@ -79,18 +86,19 @@ func runDecommission(t *test, c *cluster, nodes int, duration time.Duration) {
// at some point.
numDecom := (defaultReplicationFactor - 1) / 2

c.Put(ctx, workload, "./workload", c.Node(nodes))
// node1 is kept pinned (i.e. not decommissioned/restarted), and is the node
// through which we run the workload and other queries.
pinnedNode := 1
c.Put(ctx, cockroach, "./cockroach", c.All())
c.Put(ctx, workload, "./workload", c.Node(pinnedNode))

for i := 1; i <= numDecom; i++ {
for i := 1; i <= nodes; i++ {
c.Start(ctx, t, c.Node(i), startArgs(fmt.Sprintf("-a=--attrs=node%d", i)))
}
c.Run(ctx, c.Node(pinnedNode), `./workload init kv --drop`)

c.Start(ctx, t, c.Range(numDecom+1, nodes))
c.Run(ctx, c.Node(nodes), `./workload init kv --drop`)

waitReplicatedAwayFrom := func(downNodeID string) error {
db := c.Conn(ctx, nodes)
waitReplicatedAwayFrom := func(downNodeID int) error {
db := c.Conn(ctx, pinnedNode)
defer func() {
_ = db.Close()
}()
@@ -121,52 +129,47 @@ func runDecommission(t *test, c *cluster, nodes int, duration time.Duration) {
return nil
}

waitUpReplicated := func(targetNodeID string) error {
db := c.Conn(ctx, nodes)
waitUpReplicated := func(targetNode, targetNodeID int) error {
db := c.Conn(ctx, pinnedNode)
defer func() {
_ = db.Close()
}()

for ok := false; !ok; {
var count int
for {
// Check to see that there are no ranges where the target node is
// not part of the replica set.
stmtReplicaCount := fmt.Sprintf(
`SELECT count(*) = 0 FROM crdb_internal.ranges WHERE array_position(replicas, %s) IS NULL and database_name = 'kv';`, targetNodeID)
t.Status(stmtReplicaCount)
if err := db.QueryRow(stmtReplicaCount).Scan(&ok); err != nil {
`SELECT count(*) FROM crdb_internal.ranges WHERE array_position(replicas, %d) IS NULL and database_name = 'kv';`, targetNodeID)
if err := db.QueryRow(stmtReplicaCount).Scan(&count); err != nil {
return err
}
t.Status(fmt.Sprintf("node%d missing %d replica(s)", targetNode, count))
if count == 0 {
break
}
time.Sleep(time.Second)
}
return nil
}

if err := waitReplicatedAwayFrom("0" /* no down node */); err != nil {
if err := waitReplicatedAwayFrom(0 /* no down node */); err != nil {
t.Fatal(err)
}

loadDuration := " --duration=" + duration.String()

workloads := []string{
// TODO(tschottdorf): in remote mode, the ui shows that we consistently write
// at 330 qps (despite asking for 500 below). Locally we get 500qps (and a lot
// more without rate limiting). Check what's up with that.
"./workload run kv --max-rate 500 --tolerate-errors" + loadDuration + " {pgurl:1-%d}",
fmt.Sprintf("./workload run kv --max-rate 500 --tolerate-errors --duration=%s {pgurl:1-%d}", duration.String(), nodes),
}

run := func(stmtStr string) {
db := c.Conn(ctx, nodes)
run := func(stmt string) {
db := c.Conn(ctx, pinnedNode)
defer db.Close()
stmt := fmt.Sprintf(stmtStr, "", "=")
// We are removing the EXPERIMENTAL keyword in 2.1. For compatibility
// with 2.0 clusters we still need to try with it if the
// syntax without EXPERIMENTAL fails.
// TODO(knz): Remove this in 2.2.

t.Status(stmt)
_, err := db.ExecContext(ctx, stmt)
if err != nil && strings.Contains(err.Error(), "syntax error") {
stmt = fmt.Sprintf(stmtStr, "EXPERIMENTAL", "")
t.Status(stmt)
_, err = db.ExecContext(ctx, stmt)
}
if err != nil {
t.Fatal(err)
}
@@ -177,20 +180,19 @@ func runDecommission(t *test, c *cluster, nodes int, duration time.Duration) {
m, ctx = errgroup.WithContext(ctx)
for _, cmd := range workloads {
cmd := cmd // copy is important for goroutine

cmd = fmt.Sprintf(cmd, nodes)
m.Go(func() error {
return c.RunE(ctx, c.Node(nodes), cmd)
return c.RunE(ctx, c.Node(pinnedNode), cmd)
})
}

m.Go(func() error {
nodeID := func(node int) (string, error) {
getNodeID := func(node int) (int, error) {
dbNode := c.Conn(ctx, node)
defer dbNode.Close()
var nodeID string

var nodeID int
if err := dbNode.QueryRow(`SELECT node_id FROM crdb_internal.node_runtime_info LIMIT 1`).Scan(&nodeID); err != nil {
return "", err
return 0, err
}
return nodeID, nil
}
@@ -201,21 +203,32 @@ func runDecommission(t *test, c *cluster, nodes int, duration time.Duration) {
return c.RunE(ctx, c.Node(node), "./cockroach quit --insecure --host=:"+port)
}

decom := func(id string) error {
port := fmt.Sprintf("{pgport:%d}", nodes) // always use last node
t.Status("decommissioning node", id)
return c.RunE(ctx, c.Node(nodes), "./cockroach node decommission --insecure --wait=all --host=:"+port+" "+id)
decom := func(id int) error {
port := fmt.Sprintf("{pgport:%d}", pinnedNode) // always use the pinned node
t.Status(fmt.Sprintf("decommissioning node %d", id))
return c.RunE(ctx, c.Node(pinnedNode), fmt.Sprintf("./cockroach node decommission --insecure --wait=all --host=:%s %d", port, id))
}

for tBegin, whileDown, node := timeutil.Now(), true, 1; timeutil.Since(tBegin) <= duration; whileDown, node = !whileDown, (node%numDecom)+1 {
tBegin, whileDown := timeutil.Now(), true
node := nodes
for timeutil.Since(tBegin) <= duration {
// Alternate between the node being shut down gracefully before and
// after the decommissioning operation.
whileDown = !whileDown
// Cycle through the last numDecom nodes.
node = nodes - (node % numDecom)
if node == pinnedNode {
t.Fatalf("programming error: not expecting to decommission/wipe node%d", pinnedNode)
}

t.Status(fmt.Sprintf("decommissioning %d (down=%t)", node, whileDown))
id, err := nodeID(node)
nodeID, err := getNodeID(node)
if err != nil {
return err
}
run(fmt.Sprintf(`ALTER RANGE default %%[1]s CONFIGURE ZONE %%[2]s 'constraints: {"+node%d"}'`, node))

if err := waitUpReplicated(id); err != nil {
run(fmt.Sprintf(`ALTER RANGE default CONFIGURE ZONE = 'constraints: {"+node%d"}'`, node))
if err := waitUpReplicated(node, nodeID); err != nil {
return err
}

@@ -225,13 +238,13 @@ func runDecommission(t *test, c *cluster, nodes int, duration time.Duration) {
}
}

run(fmt.Sprintf(`ALTER RANGE default %%[1]s CONFIGURE ZONE %%[2]s 'constraints: {"-node%d"}'`, node))
run(fmt.Sprintf(`ALTER RANGE default CONFIGURE ZONE = 'constraints: {"-node%d"}'`, node))

if err := decom(id); err != nil {
if err := decom(nodeID); err != nil {
return err
}

if err := waitReplicatedAwayFrom(id); err != nil {
if err := waitReplicatedAwayFrom(nodeID); err != nil {
return err
}

@@ -241,14 +254,15 @@ func runDecommission(t *test, c *cluster, nodes int, duration time.Duration) {
}
}

// Wipe the node and re-add to cluster with a new node ID.
if err := c.RunE(ctx, c.Node(node), "rm -rf {store-dir}"); err != nil {
return err
}

db := c.Conn(ctx, 1)
db := c.Conn(ctx, pinnedNode)
defer db.Close()

sArgs := startArgs(fmt.Sprintf("-a=--join %s --attrs=node%d", c.InternalAddr(ctx, c.Node(nodes))[0], node))
sArgs := startArgs(fmt.Sprintf("-a=--join %s --attrs=node%d", c.InternalAddr(ctx, c.Node(pinnedNode))[0], node))
if err := c.StartE(ctx, c.Node(node), sArgs); err != nil {
return err
}
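
For readers who want to exercise the replica-count check outside of
roachtest, here is a standalone sketch of the polling pattern used by the
waitUpReplicated helper in the hunks above, written against database/sql;
the driver choice and connection string are assumptions for a local
insecure cluster, not part of the test harness:

package main

import (
    "database/sql"
    "fmt"
    "log"
    "time"

    _ "github.com/lib/pq" // CockroachDB speaks the Postgres wire protocol.
)

// waitUpReplicated polls until every range in the kv database has a replica
// on the target node, mirroring the helper in the roachtest above.
func waitUpReplicated(db *sql.DB, targetNode, targetNodeID int) error {
    for {
        stmt := fmt.Sprintf(
            `SELECT count(*) FROM crdb_internal.ranges WHERE array_position(replicas, %d) IS NULL and database_name = 'kv'`,
            targetNodeID)
        var count int
        if err := db.QueryRow(stmt).Scan(&count); err != nil {
            return err
        }
        log.Printf("node%d missing %d replica(s)", targetNode, count)
        if count == 0 {
            return nil
        }
        time.Sleep(time.Second)
    }
}

func main() {
    // Placeholder connection string for a local insecure cluster.
    db, err := sql.Open("postgres", "postgresql://root@localhost:26257/?sslmode=disable")
    if err != nil {
        log.Fatal(err)
    }
    defer db.Close()
    if err := waitUpReplicated(db, 2 /* targetNode */, 2 /* targetNodeID */); err != nil {
        log.Fatal(err)
    }
}
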
12 changes: 11 additions & 1 deletion pkg/kv/kvserver/node_liveness.go
@@ -589,7 +589,7 @@ func (nl *NodeLiveness) StartHeartbeat(
}
return nil
}); err != nil {
log.Warningf(ctx, "failed node liveness heartbeat: %+v", err)
log.Warningf(ctx, heartbeatFailureLogFormat, err)
}

nl.heartbeatToken <- struct{}{}
@@ -602,6 +602,16 @@ func (nl *NodeLiveness) StartHeartbeat(
})
}

const heartbeatFailureLogFormat = `failed node liveness heartbeat: %+v
An inability to maintain liveness will prevent a node from participating in a
cluster. If this problem persists, it may be a sign of resource starvation or
of network connectivity problems. For help troubleshooting, visit:
https://www.cockroachlabs.com/docs/stable/cluster-setup-troubleshooting.html#node-liveness-issues
`

// PauseHeartbeat stops or restarts the periodic heartbeat depending on the
// pause parameter. When pause is true, waits until it acquires the heartbeatToken
// (unless heartbeat was already paused); this ensures that no heartbeats happen
