From d5350c119ddb39c33b36f980e224064104de4c36 Mon Sep 17 00:00:00 2001 From: Mahmood Ali Date: Fri, 29 May 2020 13:03:54 -0400 Subject: [PATCH] Handle nil/empty cluster metadata Handle the case where a snapshot is made before cluster metadata is created. This fixes a bug where a server may have empty cluster metadata if it created and installed a Raft snapshot before a new cluster metadata ID was generated. This case is very unlikely to arise. The most likely cause is a slow upgrade from an old version, during which servers may create and install snapshots before all servers have been upgraded. This happened for a user with a log line like: ``` 2020-05-21T15:21:56.996Z [ERROR] nomad.fsm: ClusterSetMetadata failed: error="set cluster metadata failed: refusing to set new cluster id, previous: , new: < ``` --- nomad/fsm.go | 6 ++++++ nomad/state/state_store.go | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/nomad/fsm.go b/nomad/fsm.go index 5ce00d3884f..39b4cf8f06f 100644 --- a/nomad/fsm.go +++ b/nomad/fsm.go @@ -2140,6 +2140,9 @@ func (s *nomadSnapshot) persistSchedulerConfig(sink raft.SnapshotSink, if err != nil { return err } + if schedConfig == nil { + return nil + } // Write out scheduler config sink.Write([]byte{byte(SchedulerConfigSnapshot)}) if err := encoder.Encode(schedConfig); err != nil { @@ -2156,6 +2159,9 @@ func (s *nomadSnapshot) persistClusterMetadata(sink raft.SnapshotSink, if err != nil { return err } + if clusterMetadata == nil { + return nil + } // Write out the cluster metadata sink.Write([]byte{byte(ClusterMetadataSnapshot)}) diff --git a/nomad/state/state_store.go b/nomad/state/state_store.go index 9b0eca9cc16..81cba1edec8 100644 --- a/nomad/state/state_store.go +++ b/nomad/state/state_store.go @@ -5114,7 +5114,7 @@ func (s *StateStore) setClusterMetadata(txn *memdb.Txn, meta *structs.ClusterMet if existing != nil { existingClusterID := existing.(*structs.ClusterMetadata).ClusterID - if meta.ClusterID != existingClusterID { + if meta.ClusterID != 
existingClusterID && existingClusterID != "" { // there is a bug in cluster ID detection return fmt.Errorf("refusing to set new cluster id, previous: %s, new: %s", existingClusterID, meta.ClusterID) }