Skip to content

Commit

Permalink
Merge pull request #9564 from gyuho/membership-reconfiguration-tests
Browse files Browse the repository at this point in the history
functional: add membership reconfiguration tests
  • Loading branch information
gyuho authored Apr 11, 2018
2 parents f46368c + 512445f commit c77ffcc
Show file tree
Hide file tree
Showing 11 changed files with 576 additions and 191 deletions.
8 changes: 8 additions & 0 deletions functional.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -121,13 +121,18 @@ tester-config:

failure-delay-ms: 7000
failure-shuffle: true

# For full descriptions,
# https://godoc.org/github.com/coreos/etcd/functional/rpcpb#FailureCase
failure-cases:
- SIGTERM_ONE_FOLLOWER
- SIGTERM_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT
- SIGTERM_LEADER
- SIGTERM_LEADER_UNTIL_TRIGGER_SNAPSHOT
- SIGTERM_QUORUM
- SIGTERM_ALL
- SIGQUIT_AND_REMOVE_ONE_FOLLOWER
- SIGQUIT_AND_REMOVE_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT
- BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER
- BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT
- BLACKHOLE_PEER_PORT_TX_RX_LEADER
Expand All @@ -149,6 +154,9 @@ tester-config:
- NO_FAIL_WITH_STRESS
- NO_FAIL_WITH_NO_STRESS_FOR_LIVENESS

# - SIGQUIT_AND_REMOVE_LEADER
# - SIGQUIT_AND_REMOVE_LEADER_UNTIL_TRIGGER_SNAPSHOT

failpoint-commands:
- panic("etcd-tester")
# - panic("etcd-tester"),1*sleep(1000)
Expand Down
36 changes: 30 additions & 6 deletions functional/agent/handler.go
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ func (srv *Server) handle_INITIAL_START_ETCD(req *rpcpb.Request) (*rpcpb.Respons
}
srv.lg.Info("created base directory", zap.String("path", srv.Member.BaseDir))

if err = srv.saveEtcdLogFile(); err != nil {
if err = srv.createEtcdLogFile(); err != nil {
return nil, err
}

Expand Down Expand Up @@ -215,7 +215,7 @@ func (srv *Server) stopProxy() {
}
}

func (srv *Server) saveEtcdLogFile() error {
func (srv *Server) createEtcdLogFile() error {
var err error
srv.etcdLogFile, err = os.Create(srv.Member.EtcdLogPath)
if err != nil {
Expand Down Expand Up @@ -469,11 +469,32 @@ func (srv *Server) handle_SIGQUIT_ETCD_AND_REMOVE_DATA() (*rpcpb.Response, error
}
srv.lg.Info("killed etcd", zap.String("signal", syscall.SIGQUIT.String()))

err = os.RemoveAll(srv.Member.BaseDir)
if err != nil {
srv.etcdLogFile.Sync()
srv.etcdLogFile.Close()

// for debugging purposes, rename instead of removing
if err = os.RemoveAll(srv.Member.BaseDir + ".backup"); err != nil {
return nil, err
}
if err = os.Rename(srv.Member.BaseDir, srv.Member.BaseDir+".backup"); err != nil {
return nil, err
}
srv.lg.Info(
"renamed",
zap.String("base-dir", srv.Member.BaseDir),
zap.String("new-dir", srv.Member.BaseDir+".backup"),
)

// create a new log file for next new member restart
if !fileutil.Exist(srv.Member.BaseDir) {
err = fileutil.TouchDirAll(srv.Member.BaseDir)
if err != nil {
return nil, err
}
}
if err = srv.createEtcdLogFile(); err != nil {
return nil, err
}
srv.lg.Info("removed base directory", zap.String("dir", srv.Member.BaseDir))

return &rpcpb.Response{
Success: true,
Expand Down Expand Up @@ -504,7 +525,7 @@ func (srv *Server) handle_SIGQUIT_ETCD_AND_ARCHIVE_DATA() (*rpcpb.Response, erro
}
srv.lg.Info("archived data", zap.String("base-dir", srv.Member.BaseDir))

if err = srv.saveEtcdLogFile(); err != nil {
if err = srv.createEtcdLogFile(); err != nil {
return nil, err
}

Expand All @@ -530,6 +551,9 @@ func (srv *Server) handle_SIGQUIT_ETCD_AND_REMOVE_DATA_AND_STOP_AGENT() (*rpcpb.
}
srv.lg.Info("killed etcd", zap.String("signal", syscall.SIGQUIT.String()))

srv.etcdLogFile.Sync()
srv.etcdLogFile.Close()

err = os.RemoveAll(srv.Member.BaseDir)
if err != nil {
return nil, err
Expand Down
8 changes: 4 additions & 4 deletions functional/build
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ if ! [[ "$0" =~ "functional/build" ]]; then
exit 255
fi

CGO_ENABLED=0 go build -a -v -installsuffix cgo -ldflags "-s" -o ./bin/etcd-agent ./functional/cmd/etcd-agent
CGO_ENABLED=0 go build -a -v -installsuffix cgo -ldflags "-s" -o ./bin/etcd-proxy ./functional/cmd/etcd-proxy
CGO_ENABLED=0 go build -a -v -installsuffix cgo -ldflags "-s" -o ./bin/etcd-runner ./functional/cmd/etcd-runner
CGO_ENABLED=0 go build -a -v -installsuffix cgo -ldflags "-s" -o ./bin/etcd-tester ./functional/cmd/etcd-tester
CGO_ENABLED=0 go build -v -installsuffix cgo -ldflags "-s" -o ./bin/etcd-agent ./functional/cmd/etcd-agent
CGO_ENABLED=0 go build -v -installsuffix cgo -ldflags "-s" -o ./bin/etcd-proxy ./functional/cmd/etcd-proxy
CGO_ENABLED=0 go build -v -installsuffix cgo -ldflags "-s" -o ./bin/etcd-runner ./functional/cmd/etcd-runner
CGO_ENABLED=0 go build -v -installsuffix cgo -ldflags "-s" -o ./bin/etcd-tester ./functional/cmd/etcd-tester
366 changes: 210 additions & 156 deletions functional/rpcpb/rpc.pb.go

Large diffs are not rendered by default.

48 changes: 48 additions & 0 deletions functional/rpcpb/rpc.proto
Original file line number Diff line number Diff line change
Expand Up @@ -274,6 +274,54 @@ enum FailureCase {
// must be able to process client requests.
SIGTERM_ALL = 5;

// SIGQUIT_AND_REMOVE_ONE_FOLLOWER stops a randomly chosen follower
// (non-leader), deletes its data directories on disk, and removes
// this member from cluster (membership reconfiguration). On recovery,
// tester adds a new member, and this member joins the existing cluster
// with fresh data. It waits "failure-delay-ms" before recovering this
// failure. This simulates destroying one follower machine, where operator
// needs to add a new member from a fresh machine.
// The expected behavior is that a new member joins the existing cluster,
// and then each member continues to process client requests.
SIGQUIT_AND_REMOVE_ONE_FOLLOWER = 10;

// SIGQUIT_AND_REMOVE_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT stops a randomly
// chosen follower, deletes its data directories on disk, and removes
// this member from cluster (membership reconfiguration). On recovery,
// tester adds a new member, and this member joins the existing cluster
// with fresh data. On member remove, cluster waits until most up-to-date node
// (leader) applies the snapshot count of entries since the stop operation.
// This simulates destroying a follower machine, where operator needs to add
// a new member from a fresh machine.
// The expected behavior is that a new member joins the existing cluster,
// and receives a snapshot from the active leader. As always, after
// recovery, each member must be able to process client requests.
SIGQUIT_AND_REMOVE_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT = 11;

// SIGQUIT_AND_REMOVE_LEADER stops the active leader node, deletes its
// data directories on disk, and removes this member from cluster.
// On recovery, tester adds a new member, and this member joins the
// existing cluster with fresh data. It waits "failure-delay-ms" before
// recovering this failure. This simulates destroying a leader machine,
// where operator needs to add a new member from a fresh machine.
// The expected behavior is that a new member joins the existing cluster,
// and then each member continues to process client requests.
SIGQUIT_AND_REMOVE_LEADER = 12;

// SIGQUIT_AND_REMOVE_LEADER_UNTIL_TRIGGER_SNAPSHOT stops the active leader,
// deletes its data directories on disk, and removes this member from
// cluster (membership reconfiguration). On recovery, tester adds a new
// member, and this member joins the existing cluster with fresh data. On member
// remove, cluster waits until most up-to-date node (new leader) applies
// the snapshot count of entries since the stop operation. This simulates
// destroying a leader machine, where operator needs to add a new member
// from a fresh machine.
// The expected behavior is that on member remove, cluster elects a new
// leader, and a new member joins the existing cluster and receives a
// snapshot from the newly elected leader. As always, after recovery, each
// member must be able to process client requests.
SIGQUIT_AND_REMOVE_LEADER_UNTIL_TRIGGER_SNAPSHOT = 13;

// BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER drops all outgoing/incoming
// packets from/to the peer port on a randomly chosen follower
// (non-leader), and waits for "failure-delay-ms" until recovery.
Expand Down
27 changes: 19 additions & 8 deletions functional/tester/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,19 @@ func (clus *Cluster) updateFailures() {
clus.failures = append(clus.failures,
new_FailureCase_SIGTERM_ALL(clus))

case "SIGQUIT_AND_REMOVE_ONE_FOLLOWER":
clus.failures = append(clus.failures,
new_FailureCase_SIGQUIT_AND_REMOVE_ONE_FOLLOWER(clus))
case "SIGQUIT_AND_REMOVE_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT":
clus.failures = append(clus.failures,
new_FailureCase_SIGQUIT_AND_REMOVE_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT(clus))
case "SIGQUIT_AND_REMOVE_LEADER":
clus.failures = append(clus.failures,
new_FailureCase_SIGQUIT_AND_REMOVE_LEADER(clus))
case "SIGQUIT_AND_REMOVE_LEADER_UNTIL_TRIGGER_SNAPSHOT":
clus.failures = append(clus.failures,
new_FailureCase_SIGQUIT_AND_REMOVE_LEADER_UNTIL_TRIGGER_SNAPSHOT(clus))

case "BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER":
clus.failures = append(clus.failures,
new_FailureCase_BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER(clus))
Expand Down Expand Up @@ -377,14 +390,12 @@ func (clus *Cluster) broadcast(op rpcpb.Operation) error {
}

func (clus *Cluster) sendOp(idx int, op rpcpb.Operation) error {
if op == rpcpb.Operation_INITIAL_START_ETCD {
clus.agentRequests[idx] = &rpcpb.Request{
Operation: op,
Member: clus.Members[idx],
Tester: clus.Tester,
}
} else {
clus.agentRequests[idx].Operation = op
// maintain the initial member object
// throughout the test time
clus.agentRequests[idx] = &rpcpb.Request{
Operation: op,
Member: clus.Members[idx],
Tester: clus.Tester,
}

err := clus.agentStreams[idx].Send(clus.agentRequests[idx])
Expand Down
7 changes: 7 additions & 0 deletions functional/tester/cluster_read_config.go
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,13 @@ func read(lg *zap.Logger, fpath string) (*Cluster, error) {
clus.Members[i].Etcd.WALDir = filepath.Join(mem.Etcd.DataDir, "member", "wal")
}

switch mem.Etcd.InitialClusterState {
case "new":
case "existing":
default:
return nil, fmt.Errorf("'--initial-cluster-state' got %q", mem.Etcd.InitialClusterState)
}

if mem.Etcd.HeartbeatIntervalMs == 0 {
return nil, fmt.Errorf("'--heartbeat-interval' cannot be 0 (got %+v)", mem.Etcd)
}
Expand Down
4 changes: 4 additions & 0 deletions functional/tester/cluster_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,10 @@ func Test_read(t *testing.T) {
"SIGTERM_LEADER_UNTIL_TRIGGER_SNAPSHOT",
"SIGTERM_QUORUM",
"SIGTERM_ALL",
"SIGQUIT_AND_REMOVE_ONE_FOLLOWER",
"SIGQUIT_AND_REMOVE_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT",
// "SIGQUIT_AND_REMOVE_LEADER",
// "SIGQUIT_AND_REMOVE_LEADER_UNTIL_TRIGGER_SNAPSHOT",
"BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER",
"BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT",
"BLACKHOLE_PEER_PORT_TX_RX_LEADER",
Expand Down
Loading

0 comments on commit c77ffcc

Please sign in to comment.