Skip to content

Commit

Permalink
roachtest: fix decommission/randomize
Browse files Browse the repository at this point in the history
The test could end up using fully decommissioned nodes for cli commands,
which does not work as of #55286.
Additionally, decommissioned nodes now become non-live after a short
while, so various cli output checks had to be adjusted.

Fixes #55581.

Release note: None
  • Loading branch information
tbg committed Oct 27, 2020
1 parent 5e3c201 commit a00ffe5
Showing 1 changed file with 130 additions and 67 deletions.
197 changes: 130 additions & 67 deletions pkg/cmd/roachtest/decommission.go
Original file line number Diff line number Diff line change
Expand Up @@ -309,10 +309,15 @@ func runDecommissionRandomized(ctx context.Context, t *test, c *cluster) {
Multiplier: 2,
}

// Partially decommission then recommission a random node, from another
// Partially decommission then recommission n1, from another
// random node. Run a couple of status checks while doing so.
// We hard-code n1 to guard against the hypothetical case in which
// targetNode has no replicas (yet) to begin with, which would make
// it permanently decommissioned even with `--wait=none`.
// We can choose runNode freely (including chosing targetNode) since
// the decommission process won't succeed.
{
targetNode, runNode := h.getRandNode(), h.getRandNode()
targetNode, runNode := h.nodeIDs[0], h.getRandNode()
t.l.Printf("partially decommissioning n%d from n%d\n", targetNode, runNode)
o, err := h.decommission(ctx, c.Node(targetNode), runNode,
"--wait=none", "--format=csv")
Expand Down Expand Up @@ -379,25 +384,36 @@ func runDecommissionRandomized(ctx context.Context, t *test, c *cluster) {

// Check to see that operators aren't able to decommission into
// availability. We'll undo the attempted decommissioning event by
// recommissioning the targeted nodes.
// recommissioning the targeted nodes. Note that the decommission
// attempt will not result in any individual nodes being marked
// as decommissioned, as this would only happen if *all* targeted
// nodes report no replicas, which is impossible since we're targeting
// the whole cluster here.
{
// Attempt to decommission all the nodes.
{
runNode := h.getRandNode()
t.l.Printf("attempting to decommission all nodes from n%d\n", runNode)
o, err := h.decommission(ctx, c.All(), runNode,
"--wait=none", "--format=csv")
if err != nil {
t.Fatalf("decommission failed: %v", err)
}
// NB: the retry loop here is mostly silly, but since some of the fields below
// are async, we may for example find an 'active' node instead of 'decommissioning'.
if err := retry.WithMaxAttempts(ctx, retryOpts, 50, func() error {
runNode := h.getRandNode()
t.l.Printf("attempting to decommission all nodes from n%d\n", runNode)
o, err := h.decommission(ctx, c.All(), runNode,
"--wait=none", "--format=csv")
if err != nil {
t.Fatalf("decommission failed: %v", err)
}

exp := [][]string{decommissionHeader}
for i := 1; i <= c.spec.NodeCount; i++ {
rowRegex := []string{strconv.Itoa(i), "true", `\d+`, "true", "decommissioning", "false"}
exp = append(exp, rowRegex)
}
if err := h.matchCSV(o, exp); err != nil {
t.Fatalf("decommission failed: %v", err)
exp := [][]string{decommissionHeader}
for i := 1; i <= c.spec.NodeCount; i++ {
rowRegex := []string{strconv.Itoa(i), "true", `\d+`, "true", "decommissioning", "false"}
exp = append(exp, rowRegex)
}
if err := h.matchCSV(o, exp); err != nil {
t.Fatalf("decommission failed: %v", err)
}
return nil
}); err != nil {
t.Fatal(err)
}
}

Expand Down Expand Up @@ -468,12 +484,14 @@ func runDecommissionRandomized(ctx context.Context, t *test, c *cluster) {
}
}

// Fully recommission two random nodes, from a random node, randomly choosing
// Fully decommission two random nodes, from a random node, randomly choosing
// between using --wait={all,none}. We pin these two nodes to not re-use
// them for the block after, as they will have been fully decommissioned and
// by definition, non-operational.
decommissionedNodeA := h.getRandNode()
decommissionedNodeB := h.getRandNodeOtherThan(decommissionedNodeA)
h.blockFromRandNode(decommissionedNodeA)
decommissionedNodeB := h.getRandNode() // note A != B
h.blockFromRandNode(decommissionedNodeB)
{
targetNodeA, targetNodeB := decommissionedNodeA, decommissionedNodeB
if targetNodeB < targetNodeA {
Expand Down Expand Up @@ -516,40 +534,51 @@ func runDecommissionRandomized(ctx context.Context, t *test, c *cluster) {
t.Fatal(err)
}

// Check that even though two nodes are decommissioned, we still see
// them (since they remain live) in `node ls`.
// Check that the two decommissioned nodes vanish from `node ls` (since they
// will not remain live as the cluster refuses connections from them).
{
runNode = h.getRandNode()
t.l.Printf("checking that `node ls` (from n%d) shows all nodes\n", runNode)
o, err := execCLI(ctx, t, c, runNode, "node", "ls", "--format=csv")
if err != nil {
t.Fatalf("node-ls failed: %v", err)
}
exp := [][]string{{"id"}}
for i := 1; i <= c.spec.NodeCount; i++ {
exp = append(exp, []string{strconv.Itoa(i)})
}
if err := h.matchCSV(o, exp); err != nil {
if err := retry.WithMaxAttempts(ctx, retry.Options{}, 50, func() error {
runNode = h.getRandNode()
t.l.Printf("checking that `node ls` (from n%d) shows n-2 nodes\n", runNode)
o, err := execCLI(ctx, t, c, runNode, "node", "ls", "--format=csv")
if err != nil {
t.Fatalf("node-ls failed: %v", err)
}
exp := [][]string{{"id"}}
for i := 1; i <= c.spec.NodeCount; i++ {
if _, ok := h.randNodeBlocklist[i]; ok {
// Decommissioned, so should go away in due time.
continue
}
exp = append(exp, []string{strconv.Itoa(i)})
}
return h.matchCSV(o, exp)
}); err != nil {
t.Fatal(err)
}
}

// Ditto for `node status`.
{
runNode = h.getRandNode()
t.l.Printf("checking that `node status` (from n%d) shows all nodes\n", runNode)
t.l.Printf("checking that `node status` (from n%d) shows only live nodes\n", runNode)
o, err := execCLI(ctx, t, c, runNode, "node", "status", "--format=csv")
if err != nil {
t.Fatalf("node-status failed: %v", err)
}

numCols := h.getCsvNumCols(o)
colRegex := []string{}
for i := 1; i <= c.spec.NodeCount; i++ {
colRegex = append(colRegex, strconv.Itoa(i))
}
exp := h.expectIDsInStatusOut(colRegex, numCols)
if err := h.matchCSV(o, exp); err != nil {
if err := retry.WithMaxAttempts(ctx, retry.Options{}, 50, func() error {
colRegex := []string{}
for i := 1; i <= c.spec.NodeCount; i++ {
if _, ok := h.randNodeBlocklist[i]; ok {
continue // decommissioned
}
colRegex = append(colRegex, strconv.Itoa(i))
}
exp := h.expectIDsInStatusOut(colRegex, numCols)
return h.matchCSV(o, exp)
}); err != nil {
t.Fatal(err)
}
}
Expand Down Expand Up @@ -579,8 +608,10 @@ func runDecommissionRandomized(ctx context.Context, t *test, c *cluster) {

exp := [][]string{
decommissionHeader,
{strconv.Itoa(targetNodeA), "true", "0", "true", "decommissioned", "false"},
{strconv.Itoa(targetNodeB), "true", "0", "true", "decommissioned", "false"},
// NB: the "false" is liveness. We waited above for these nodes to
// vanish from `node ls`, so definitely not live at this point.
{strconv.Itoa(targetNodeA), "false", "0", "true", "decommissioned", "false"},
{strconv.Itoa(targetNodeB), "false", "0", "true", "decommissioned", "false"},
decommissionFooter,
}
if err := h.matchCSV(o, exp); err != nil {
Expand All @@ -599,6 +630,9 @@ func runDecommissionRandomized(ctx context.Context, t *test, c *cluster) {
if _, err := h.recommission(ctx, c.Nodes(targetNodeA, targetNodeB), runNode); err == nil {
t.Fatalf("expected recommission to fail")
}
// Now stop+wipe them for good to keep the logs simple and the dead node detector happy.
c.Stop(ctx, c.Nodes(targetNodeA, targetNodeB))
c.Wipe(ctx, c.Nodes(targetNodeA, targetNodeB))
}
}

Expand All @@ -621,7 +655,7 @@ func runDecommissionRandomized(ctx context.Context, t *test, c *cluster) {
// everything will seem dead to the allocator at all times, so
// nothing will ever happen.
func() {
db := c.Conn(ctx, 1)
db := c.Conn(ctx, h.getRandNode())
defer db.Close()
const stmt = "SET CLUSTER SETTING server.time_until_store_dead = '1m15s'"
if _, err := db.ExecContext(ctx, stmt); err != nil {
Expand All @@ -634,11 +668,14 @@ func runDecommissionRandomized(ctx context.Context, t *test, c *cluster) {
// wiping it below. Roachprod attempts to initialize a cluster when
// starting a "fresh" first node (without an existing bootstrap marker
// on disk, which we happen to also be wiping away).
targetNode := h.getRandNodeOtherThan(decommissionedNodeA, decommissionedNodeB, firstNodeID)
targetNode := h.getRandNodeOtherThan(firstNodeID)
h.blockFromRandNode(targetNode)
t.l.Printf("intentionally killing n%d to later decommission it when down\n", targetNode)
c.Stop(ctx, c.Node(targetNode))

runNode := h.getRandNodeOtherThan(targetNode)
// Pick a runNode that is still in commission and will
// remain so (or it won't be able to contact cluster).
runNode := h.getRandNode()
t.l.Printf("decommissioning n%d (from n%d) in absentia\n", targetNode, runNode)
if _, err := h.decommission(ctx, c.Node(targetNode), runNode,
"--wait=all", "--format=csv"); err != nil {
Expand Down Expand Up @@ -676,14 +713,15 @@ func runDecommissionRandomized(ctx context.Context, t *test, c *cluster) {
// Check that (at least after a bit) the node disappears from `node
// ls` because it is decommissioned and not live.
if err := retry.WithMaxAttempts(ctx, retryOpts, 50, func() error {
runNode := h.getRandNodeOtherThan(targetNode)
runNode := h.getRandNode()
o, err := execCLI(ctx, t, c, runNode, "node", "ls", "--format=csv")
if err != nil {
t.Fatalf("node-ls failed: %v", err)
}

var exp [][]string
for i := 1; i <= c.spec.NodeCount; i++ {
// We expect an entry for every node we haven't decommissioned yet.
for i := 1; i <= c.spec.NodeCount-len(h.randNodeBlocklist); i++ {
exp = append(exp, []string{fmt.Sprintf("[^%d]", targetNode)})
}

Expand All @@ -694,16 +732,15 @@ func runDecommissionRandomized(ctx context.Context, t *test, c *cluster) {

// Ditto for `node status`
if err := retry.WithMaxAttempts(ctx, retryOpts, 50, func() error {
runNode := h.getRandNodeOtherThan(targetNode)
runNode := h.getRandNode()
o, err := execCLI(ctx, t, c, runNode, "node", "status", "--format=csv")
if err != nil {
t.Fatalf("node-status failed: %v", err)
}

numCols := h.getCsvNumCols(o)
var expC []string
// We're checking for n-1 rows, where n is the node count.
for i := 1; i < c.spec.NodeCount; i++ {
for i := 1; i <= c.spec.NodeCount-len(h.randNodeBlocklist); i++ {
expC = append(expC, fmt.Sprintf("[^%d].*", targetNode))
}
exp := h.expectIDsInStatusOut(expC, numCols)
Expand All @@ -719,7 +756,7 @@ func runDecommissionRandomized(ctx context.Context, t *test, c *cluster) {
c.Stop(ctx, c.Node(targetNode))
c.Wipe(ctx, c.Node(targetNode))

joinNode := targetNode%c.spec.NodeCount + 1
joinNode := h.getRandNode()
joinAddr := c.InternalAddr(ctx, c.Node(joinNode))[0]
c.Start(ctx, t, c.Node(targetNode), startArgs(
fmt.Sprintf("-a=--join %s", joinAddr),
Expand All @@ -733,8 +770,17 @@ func runDecommissionRandomized(ctx context.Context, t *test, c *cluster) {
}
numCols := h.getCsvNumCols(o)
var expC []string
for i := 1; i <= c.spec.NodeCount; i++ {
expC = append(expC, fmt.Sprintf("[^%d].*", targetNode))
// The decommissioned nodes should all disappear. (We
// abuse that nodeIDs are single-digit in this test).
re := `[^`
for id := range h.randNodeBlocklist {
re += fmt.Sprint(id)
}
re += `].*`
// We expect to all the decommissioned nodes to
// disappear, but we let one rejoin as a fresh node.
for i := 1; i <= c.spec.NodeCount-len(h.randNodeBlocklist)+1; i++ {
expC = append(expC, re)
}
exp := h.expectIDsInStatusOut(expC, numCols)
return h.matchCSV(o, exp)
Expand All @@ -748,7 +794,7 @@ func runDecommissionRandomized(ctx context.Context, t *test, c *cluster) {
if err := retry.ForDuration(time.Minute, func() error {
// Verify the event log has recorded exactly one decommissioned or
// recommissioned event for each membership operation.
db := c.Conn(ctx, 1)
db := c.Conn(ctx, h.getRandNode())
defer db.Close()

rows, err := db.Query(`
Expand Down Expand Up @@ -837,6 +883,9 @@ type decommTestHelper struct {
t *test
c *cluster
nodeIDs []int
// randNodeBlocklist are the nodes that won't be returned from randNode().
// populated via blockFromRandNode().
randNodeBlocklist map[int]struct{}
}

func newDecommTestHelper(t *test, c *cluster) *decommTestHelper {
Expand All @@ -845,9 +894,10 @@ func newDecommTestHelper(t *test, c *cluster) *decommTestHelper {
nodeIDs = append(nodeIDs, i)
}
return &decommTestHelper{
t: t,
c: c,
nodeIDs: nodeIDs,
t: t,
c: c,
nodeIDs: nodeIDs,
randNodeBlocklist: map[int]struct{}{},
}
}

Expand Down Expand Up @@ -1021,23 +1071,36 @@ func (h *decommTestHelper) expectIDsInStatusOut(ids []string, numCols int) [][]s
return res
}

// blockFromRandNode ensures the node isn't returned from getRandNode{,OtherThan)
// in the future.
func (h *decommTestHelper) blockFromRandNode(id int) {
h.randNodeBlocklist[id] = struct{}{}
}

// getRandNode returns a random node that hasn't
// been passed to blockFromRandNode() earlier.
func (h *decommTestHelper) getRandNode() int {
return h.nodeIDs[rand.Intn(len(h.nodeIDs))]
return h.getRandNodeOtherThan()
}

// getRandNodeOtherThan is like getRandNode, but additionally
// avoids returning the specified ids.
func (h *decommTestHelper) getRandNodeOtherThan(ids ...int) int {
m := map[int]struct{}{}
for _, id := range ids {
m[id] = struct{}{}
}
for id := range h.randNodeBlocklist {
m[id] = struct{}{}
}
if len(m) == len(h.nodeIDs) {
h.t.Fatal("all nodes are blocked")
}
for {
cur := h.nodeIDs[rand.Intn(len(h.nodeIDs))]
inBlockList := false
for _, id := range ids {
if cur == id {
inBlockList = true
}
}
if inBlockList {
continue
id := h.nodeIDs[rand.Intn(len(h.nodeIDs))]
if _, ok := m[id]; !ok {
return id
}
return cur
}
}

Expand Down

0 comments on commit a00ffe5

Please sign in to comment.