From 96b8a4b465c69e91de34b8434bdb4f9c035c209b Mon Sep 17 00:00:00 2001 From: James Shubin Date: Wed, 27 Mar 2019 15:24:33 -0400 Subject: [PATCH] etcdserver: Use panic instead of fatal on no space left error When using the embed package to embed etcd, sometimes the storage prefix being used might be full. In this case, this code path triggers, causing an `etcdserver: create wal error: no space left on device` error, which causes a fatal. A fatal differs from a panic in that it also calls os.Exit(1). In this situation, the calling program that embeds the etcd server will be abruptly killed, which prevents it from cleaning up safely and giving a proper error message. Depending on what the calling program is, this can cause corruption and data loss. This patch switches the fatal to a panic. Ideally this would be a regular error which would get propagated upwards to the StartEtcd command, but in the meantime at least this can be caught with recover(). This fixes the most common fatal that I've experienced, but there are surely more that need looking into. If possible, the errors should be threaded down into the code path so that embedding etcd can be more robust. Fixes: https://github.com/etcd-io/etcd/issues/10588 --- etcdserver/raft.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/etcdserver/raft.go b/etcdserver/raft.go index 1ff137642d5e..3cf4064cafa7 100644 --- a/etcdserver/raft.go +++ b/etcdserver/raft.go @@ -427,9 +427,9 @@ func startNode(cfg ServerConfig, cl *membership.RaftCluster, ids []types.ID) (id ) if w, err = wal.Create(cfg.Logger, cfg.WALDir(), metadata); err != nil { if cfg.Logger != nil { - cfg.Logger.Fatal("failed to create WAL", zap.Error(err)) + cfg.Logger.Panic("failed to create WAL", zap.Error(err)) } else { - plog.Fatalf("create wal error: %v", err) + plog.Panicf("create wal error: %v", err) } } peers := make([]raft.Peer, len(ids))