From d31e1cb79984c36f6c3f32aa7ef26b0bb997e4f7 Mon Sep 17 00:00:00 2001 From: fanmin shi Date: Thu, 4 May 2017 15:57:25 -0700 Subject: [PATCH] etcdserver: renaming db happens before snapshot persists to wal and snap files In the case that a follower receives a snapshot from the leader and crashes before renaming xxx.snap.db to db but after the snapshot has been persisted to .wal and .snap, restarting the follower results in loading the old db, the new .wal, and the new .snap. This causes an index mismatch between the snap metadata index and the consistent index from db. This PR forces an ordering where saving/renaming db must happen before the snapshot is persisted to the wal and snap files. This ensures that the db file can never be older than the wal and snap files. Hence, it guarantees the invariant snapshot.Metadata.Index <= db.ConsistentIndex() in NewServer() when checking the validity of the db and snap files. FIXES #7628 --- etcdserver/raft.go | 17 +++++++++++------ etcdserver/server.go | 2 ++ 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/etcdserver/raft.go b/etcdserver/raft.go index b87ceea54660..f0c353f432c7 100644 --- a/etcdserver/raft.go +++ b/etcdserver/raft.go @@ -81,9 +81,10 @@ type RaftTimer interface { // to raft storage concurrently; the application must read // raftDone before assuming the raft messages are stable. 
type apply struct { - entries []raftpb.Entry - snapshot raftpb.Snapshot - raftDone <-chan struct{} // rx {} after raft has persisted messages + entries []raftpb.Entry + snapshot raftpb.Snapshot + raftDone <-chan struct{} // rx {} after raft has persisted messages + replaceDBDone chan struct{} // snapshot db from leader has replaced the current db } type raftNode struct { @@ -191,10 +192,12 @@ func (r *raftNode) start(rh *raftReadyHandler) { } raftDone := make(chan struct{}, 1) + replaceDBDone := make(chan struct{}, 1) ap := apply{ - entries: rd.CommittedEntries, - snapshot: rd.Snapshot, - raftDone: raftDone, + entries: rd.CommittedEntries, + snapshot: rd.Snapshot, + raftDone: raftDone, + replaceDBDone: replaceDBDone, } updateCommittedIndex(&ap, rh) @@ -223,6 +226,8 @@ func (r *raftNode) start(rh *raftReadyHandler) { // gofail: var raftAfterSave struct{} if !raft.IsEmptySnap(rd.Snapshot) { + // wait for the etcd server to finish renaming the snap db to db. + <-replaceDBDone // gofail: var raftBeforeSaveSnap struct{} if err := r.storage.SaveSnap(rd.Snapshot); err != nil { plog.Fatalf("raft save snapshot error: %v", err) diff --git a/etcdserver/server.go b/etcdserver/server.go index 33430276bff6..226b543c1292 100644 --- a/etcdserver/server.go +++ b/etcdserver/server.go @@ -812,6 +812,8 @@ func (s *EtcdServer) applySnapshot(ep *etcdProgress, apply *apply) { if err := os.Rename(snapfn, fn); err != nil { plog.Panicf("rename snapshot file error: %v", err) } + // notifies raftNode that db has been replaced. + apply.replaceDBDone <- struct{}{} newbe := newBackend(fn, s.Cfg.QuotaBackendBytes)