functional: add membership reconfiguration tests #9564

Merged · 17 commits · Apr 11, 2018
8 changes: 8 additions & 0 deletions functional.yaml
@@ -121,13 +121,18 @@ tester-config:

failure-delay-ms: 7000
failure-shuffle: true

# For full descriptions,
# https://godoc.org/github.com/coreos/etcd/functional/rpcpb#FailureCase
failure-cases:
- SIGTERM_ONE_FOLLOWER
- SIGTERM_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT
- SIGTERM_LEADER
- SIGTERM_LEADER_UNTIL_TRIGGER_SNAPSHOT
- SIGTERM_QUORUM
- SIGTERM_ALL
- SIGQUIT_AND_REMOVE_ONE_FOLLOWER
- SIGQUIT_AND_REMOVE_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT
- BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER
- BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT
- BLACKHOLE_PEER_PORT_TX_RX_LEADER
@@ -149,6 +154,9 @@ tester-config:
- NO_FAIL_WITH_STRESS
- NO_FAIL_WITH_NO_STRESS_FOR_LIVENESS

# - SIGQUIT_AND_REMOVE_LEADER
# - SIGQUIT_AND_REMOVE_LEADER_UNTIL_TRIGGER_SNAPSHOT

failpoint-commands:
- panic("etcd-tester")
# - panic("etcd-tester"),1*sleep(1000)
36 changes: 30 additions & 6 deletions functional/agent/handler.go
@@ -92,7 +92,7 @@ func (srv *Server) handle_INITIAL_START_ETCD(req *rpcpb.Request) (*rpcpb.Respons
}
srv.lg.Info("created base directory", zap.String("path", srv.Member.BaseDir))

if err = srv.saveEtcdLogFile(); err != nil {
if err = srv.createEtcdLogFile(); err != nil {
return nil, err
}

@@ -215,7 +215,7 @@ func (srv *Server) stopProxy() {
}
}

func (srv *Server) saveEtcdLogFile() error {
func (srv *Server) createEtcdLogFile() error {
var err error
srv.etcdLogFile, err = os.Create(srv.Member.EtcdLogPath)
if err != nil {
@@ -469,11 +469,32 @@ func (srv *Server) handle_SIGQUIT_ETCD_AND_REMOVE_DATA() (*rpcpb.Response, error
}
srv.lg.Info("killed etcd", zap.String("signal", syscall.SIGQUIT.String()))

err = os.RemoveAll(srv.Member.BaseDir)
if err != nil {
srv.etcdLogFile.Sync()
srv.etcdLogFile.Close()

// for debugging purposes, rename instead of removing
if err = os.RemoveAll(srv.Member.BaseDir + ".backup"); err != nil {
return nil, err
}
if err = os.Rename(srv.Member.BaseDir, srv.Member.BaseDir+".backup"); err != nil {
return nil, err
}
srv.lg.Info(
"renamed",
zap.String("base-dir", srv.Member.BaseDir),
zap.String("new-dir", srv.Member.BaseDir+".backup"),
)

// create a new log file for the next member restart
if !fileutil.Exist(srv.Member.BaseDir) {
err = fileutil.TouchDirAll(srv.Member.BaseDir)
if err != nil {
return nil, err
}
}
if err = srv.createEtcdLogFile(); err != nil {
return nil, err
}
srv.lg.Info("removed base directory", zap.String("dir", srv.Member.BaseDir))

return &rpcpb.Response{
Success: true,
@@ -504,7 +525,7 @@ func (srv *Server) handle_SIGQUIT_ETCD_AND_ARCHIVE_DATA() (*rpcpb.Response, erro
}
srv.lg.Info("archived data", zap.String("base-dir", srv.Member.BaseDir))

if err = srv.saveEtcdLogFile(); err != nil {
if err = srv.createEtcdLogFile(); err != nil {
return nil, err
}

@@ -530,6 +551,9 @@ func (srv *Server) handle_SIGQUIT_ETCD_AND_REMOVE_DATA_AND_STOP_AGENT() (*rpcpb.
}
srv.lg.Info("killed etcd", zap.String("signal", syscall.SIGQUIT.String()))

srv.etcdLogFile.Sync()
srv.etcdLogFile.Close()

err = os.RemoveAll(srv.Member.BaseDir)
if err != nil {
return nil, err
8 changes: 4 additions & 4 deletions functional/build
@@ -5,7 +5,7 @@ if ! [[ "$0" =~ "functional/build" ]]; then
exit 255
fi

CGO_ENABLED=0 go build -a -v -installsuffix cgo -ldflags "-s" -o ./bin/etcd-agent ./functional/cmd/etcd-agent
CGO_ENABLED=0 go build -a -v -installsuffix cgo -ldflags "-s" -o ./bin/etcd-proxy ./functional/cmd/etcd-proxy
CGO_ENABLED=0 go build -a -v -installsuffix cgo -ldflags "-s" -o ./bin/etcd-runner ./functional/cmd/etcd-runner
CGO_ENABLED=0 go build -a -v -installsuffix cgo -ldflags "-s" -o ./bin/etcd-tester ./functional/cmd/etcd-tester
CGO_ENABLED=0 go build -v -installsuffix cgo -ldflags "-s" -o ./bin/etcd-agent ./functional/cmd/etcd-agent
CGO_ENABLED=0 go build -v -installsuffix cgo -ldflags "-s" -o ./bin/etcd-proxy ./functional/cmd/etcd-proxy
CGO_ENABLED=0 go build -v -installsuffix cgo -ldflags "-s" -o ./bin/etcd-runner ./functional/cmd/etcd-runner
CGO_ENABLED=0 go build -v -installsuffix cgo -ldflags "-s" -o ./bin/etcd-tester ./functional/cmd/etcd-tester
366 changes: 210 additions & 156 deletions functional/rpcpb/rpc.pb.go

Large diffs are not rendered by default.

48 changes: 48 additions & 0 deletions functional/rpcpb/rpc.proto
@@ -274,6 +274,54 @@ enum FailureCase {
// must be able to process client requests.
SIGTERM_ALL = 5;

// SIGQUIT_AND_REMOVE_ONE_FOLLOWER stops a randomly chosen follower
// (non-leader), deletes its data directories on disk, and removes
// this member from cluster (membership reconfiguration). On recovery,
// tester adds a new member, and this member joins the existing cluster
// with fresh data. It waits "failure-delay-ms" before recovering this
// failure. This simulates destroying one follower machine, where operator
// needs to add a new member from a fresh machine.
// The expected behavior is that a new member joins the existing cluster,
// and then each member continues to process client requests.
SIGQUIT_AND_REMOVE_ONE_FOLLOWER = 10;

// SIGQUIT_AND_REMOVE_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT stops a randomly
// chosen follower, deletes its data directories on disk, and removes
// this member from cluster (membership reconfiguration). On recovery,
// tester adds a new member, and this member joins the existing cluster
// with fresh data. On member remove, cluster waits until most up-to-date node
// (leader) applies the snapshot count of entries since the stop operation.
// This simulates destroying one follower machine, where operator needs to add
// a new member from a fresh machine.
// The expected behavior is that a new member joins the existing cluster,
// and receives a snapshot from the active leader. As always, after
// recovery, each member must be able to process client requests.
SIGQUIT_AND_REMOVE_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT = 11;

// SIGQUIT_AND_REMOVE_LEADER stops the active leader node, deletes its
// data directories on disk, and removes this member from cluster.
// On recovery, tester adds a new member, and this member joins the
// existing cluster with fresh data. It waits "failure-delay-ms" before
// recovering this failure. This simulates destroying a leader machine,
// where operator needs to add a new member from a fresh machine.
// The expected behavior is that a new member joins the existing cluster,
// and then each member continues to process client requests.
SIGQUIT_AND_REMOVE_LEADER = 12;

// SIGQUIT_AND_REMOVE_LEADER_UNTIL_TRIGGER_SNAPSHOT stops the active leader,
// deletes its data directories on disk, and removes this member from
// cluster (membership reconfiguration). On recovery, tester adds a new
// member, and this member joins the existing cluster with fresh data. On member
// remove, cluster waits until most up-to-date node (new leader) applies
// the snapshot count of entries since the stop operation. This simulates
// destroying a leader machine, where operator needs to add a new member
// from a fresh machine.
// The expected behavior is that on member remove, cluster elects a new
// leader, and a new member joins the existing cluster and receives a
// snapshot from the newly elected leader. As always, after recovery, each
// member must be able to process client requests.
SIGQUIT_AND_REMOVE_LEADER_UNTIL_TRIGGER_SNAPSHOT = 13;

// BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER drops all outgoing/incoming
// packets from/to the peer port on a randomly chosen follower
// (non-leader), and waits for "failure-delay-ms" until recovery.
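The SIGQUIT_AND_REMOVE_* cases described above all go through etcd's membership reconfiguration API: the stopped member is removed from the cluster, and a replacement is added that then joins with fresh data (and receives a snapshot from the leader in the *_UNTIL_TRIGGER_SNAPSHOT variants). A minimal sketch of that remove-then-add flow using the clientv3 Cluster API is shown below; the endpoint, member ID, and peer URL are illustrative assumptions, not values taken from this PR.

// Sketch only: remove a stopped member and register a replacement via the
// clientv3 Cluster API. Endpoints, member ID, and peer URLs are illustrative.
package main

import (
	"context"
	"log"
	"time"

	"github.com/coreos/etcd/clientv3"
)

func main() {
	cli, err := clientv3.New(clientv3.Config{
		Endpoints:   []string{"http://127.0.0.1:2379"}, // assumed surviving member
		DialTimeout: 5 * time.Second,
	})
	if err != nil {
		log.Fatal(err)
	}
	defer cli.Close()

	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	// Step 1 of membership reconfiguration: drop the member that was SIGQUIT-ed.
	removedID := uint64(0xabcdef) // assumed ID of the stopped member
	if _, err = cli.MemberRemove(ctx, removedID); err != nil {
		log.Fatal(err)
	}

	// Step 2: register a fresh member; it must then be started with
	// --initial-cluster-state=existing to join the running cluster (and, in
	// the *_UNTIL_TRIGGER_SNAPSHOT cases, receive a snapshot from the leader).
	if _, err = cli.MemberAdd(ctx, []string{"http://127.0.0.1:23800"}); err != nil {
		log.Fatal(err)
	}
}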
27 changes: 19 additions & 8 deletions functional/tester/cluster.go
@@ -161,6 +161,19 @@ func (clus *Cluster) updateFailures() {
clus.failures = append(clus.failures,
new_FailureCase_SIGTERM_ALL(clus))

case "SIGQUIT_AND_REMOVE_ONE_FOLLOWER":
clus.failures = append(clus.failures,
new_FailureCase_SIGQUIT_AND_REMOVE_ONE_FOLLOWER(clus))
case "SIGQUIT_AND_REMOVE_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT":
clus.failures = append(clus.failures,
new_FailureCase_SIGQUIT_AND_REMOVE_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT(clus))
case "SIGQUIT_AND_REMOVE_LEADER":
clus.failures = append(clus.failures,
new_FailureCase_SIGQUIT_AND_REMOVE_LEADER(clus))
case "SIGQUIT_AND_REMOVE_LEADER_UNTIL_TRIGGER_SNAPSHOT":
clus.failures = append(clus.failures,
new_FailureCase_SIGQUIT_AND_REMOVE_LEADER_UNTIL_TRIGGER_SNAPSHOT(clus))

case "BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER":
clus.failures = append(clus.failures,
new_FailureCase_BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER(clus))
@@ -377,14 +390,12 @@ func (clus *Cluster) broadcast(op rpcpb.Operation) error {
}

func (clus *Cluster) sendOp(idx int, op rpcpb.Operation) error {
if op == rpcpb.Operation_INITIAL_START_ETCD {
clus.agentRequests[idx] = &rpcpb.Request{
Operation: op,
Member: clus.Members[idx],
Tester: clus.Tester,
}
} else {
clus.agentRequests[idx].Operation = op
// maintain the initial member object
// throughout the test time
clus.agentRequests[idx] = &rpcpb.Request{
Operation: op,
Member: clus.Members[idx],
Tester: clus.Tester,
}

err := clus.agentStreams[idx].Send(clus.agentRequests[idx])
7 changes: 7 additions & 0 deletions functional/tester/cluster_read_config.go
@@ -68,6 +68,13 @@ func read(lg *zap.Logger, fpath string) (*Cluster, error) {
clus.Members[i].Etcd.WALDir = filepath.Join(mem.Etcd.DataDir, "member", "wal")
}

switch mem.Etcd.InitialClusterState {
case "new":
case "existing":
default:
return nil, fmt.Errorf("'--initial-cluster-state' got %q", mem.Etcd.InitialClusterState)
}

if mem.Etcd.HeartbeatIntervalMs == 0 {
return nil, fmt.Errorf("'--heartbeat-interval' cannot be 0 (got %+v)", mem.Etcd)
}
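The initial-cluster-state check above ties into the new failure cases: the original members boot with "new", while a replacement member added after a SIGQUIT_AND_REMOVE_* failure must join with "existing". A rough sketch of starting such a replacement with the embed package follows; the member name, directories, and URLs are illustrative assumptions, not part of this PR.

// Sketch: start a replacement member that joins an already-running cluster.
// The name, data directory, and cluster URLs here are assumptions.
package main

import (
	"log"
	"net/url"

	"github.com/coreos/etcd/embed"
)

func main() {
	cfg := embed.NewConfig()
	cfg.Name = "replacement-member"          // assumed member name
	cfg.Dir = "/tmp/replacement-member.etcd" // fresh data directory

	// Assumed peer/client URLs for the replacement member.
	pu, _ := url.Parse("http://127.0.0.1:23800")
	cu, _ := url.Parse("http://127.0.0.1:23790")
	cfg.LPUrls, cfg.APUrls = []url.URL{*pu}, []url.URL{*pu}
	cfg.LCUrls, cfg.ACUrls = []url.URL{*cu}, []url.URL{*cu}

	// The replacement joins an existing cluster, so the cluster state must be
	// "existing" rather than the default "new".
	cfg.ClusterState = embed.ClusterStateFlagExisting
	cfg.InitialCluster = "m1=http://127.0.0.1:2380,replacement-member=http://127.0.0.1:23800" // assumed peers

	e, err := embed.StartEtcd(cfg)
	if err != nil {
		log.Fatal(err)
	}
	defer e.Close()

	<-e.Server.ReadyNotify() // blocks until the member has joined
	log.Println("replacement member joined the existing cluster")
}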
4 changes: 4 additions & 0 deletions functional/tester/cluster_test.go
@@ -162,6 +162,10 @@ func Test_read(t *testing.T) {
"SIGTERM_LEADER_UNTIL_TRIGGER_SNAPSHOT",
"SIGTERM_QUORUM",
"SIGTERM_ALL",
"SIGQUIT_AND_REMOVE_ONE_FOLLOWER",
"SIGQUIT_AND_REMOVE_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT",
// "SIGQUIT_AND_REMOVE_LEADER",
// "SIGQUIT_AND_REMOVE_LEADER_UNTIL_TRIGGER_SNAPSHOT",
"BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER",
"BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT",
"BLACKHOLE_PEER_PORT_TX_RX_LEADER",