From e4758c27b20e41d88fee834431696ce986f3444b Mon Sep 17 00:00:00 2001
From: Peter Mattis
Date: Thu, 31 Jan 2019 18:04:48 -0500
Subject: [PATCH 1/2] gossip: demonstrate problem with loopback info
 propagation

Release note: None
---
 pkg/gossip/gossip_test.go | 71 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 71 insertions(+)

diff --git a/pkg/gossip/gossip_test.go b/pkg/gossip/gossip_test.go
index 771e2aff6105..0619e78aa9cd 100644
--- a/pkg/gossip/gossip_test.go
+++ b/pkg/gossip/gossip_test.go
@@ -805,3 +805,74 @@ func TestGossipPropagation(t *testing.T) {
 		return nil
 	})
 }
+
+// Test whether propagation of an info that was generated by a prior
+// incarnation of a server can correctly be sent back to that originating
+// server. Consider the scenario:
+//
+// n1: decommissioned
+// n2: gossip node-liveness:1
+// n3: node-liveness range lease acquired (does not gossip node-liveness:1
+//     record because it is unchanged)
+// n2: restarted
+//   - connects as gossip client to n3
+//   - sends a batch of gossip records to n3
+//   - n3 responds without sending node-liveness:1 because its
+//     OrigStamp is less than the highwater stamp from n2
+func TestGossipLoopbackInfoPropagation(t *testing.T) {
+	defer leaktest.AfterTest(t)()
+	t.Skipf("#34494")
+	stopper := stop.NewStopper()
+	defer stopper.Stop(context.TODO())
+
+	local := startGossip(1, stopper, t, metric.NewRegistry())
+	remote := startGossip(2, stopper, t, metric.NewRegistry())
+	remote.mu.Lock()
+	rAddr := remote.mu.is.NodeAddr
+	remote.mu.Unlock()
+	local.manage()
+	remote.manage()
+
+	// Add a gossip info for "foo" on remote, that was generated by local. This
+	// simulates what happens if local was to gossip an info, and later restart
+	// and never gossip that info again.
+	func() {
+		local.mu.Lock()
+		defer local.mu.Unlock()
+		remote.mu.Lock()
+		defer remote.mu.Unlock()
+		// NB: replacing local.mu.is.newInfo with remote.mu.is.newInfo allows "foo"
+		// to be propagated.
+		if err := remote.mu.is.addInfo("foo", local.mu.is.newInfo(nil, 0)); err != nil {
+			t.Fatal(err)
+		}
+	}()
+
+	// Add an info to local so that it has a highwater timestamp that is newer
+	// than the info we added to remote. NB: commenting out this line allows
+	// "foo" to be propagated.
+	if err := local.AddInfo("bar", nil, 0); err != nil {
+		t.Fatal(err)
+	}
+
+	// Start a client connection to the remote node.
+	local.mu.Lock()
+	local.startClientLocked(&rAddr)
+	local.mu.Unlock()
+
+	getInfo := func(g *Gossip, key string) *Info {
+		g.mu.RLock()
+		defer g.mu.RUnlock()
+		return g.mu.is.Infos[key]
+	}
+
+	testutils.SucceedsSoon(t, func() error {
+		if getInfo(remote, "bar") == nil {
+			return fmt.Errorf("bar not propagated")
+		}
+		if getInfo(local, "foo") == nil {
+			return fmt.Errorf("foo not propagated")
+		}
+		return nil
+	})
+}

From 4034f80b96c5c80f162de72d45bc44a5d8702b40 Mon Sep 17 00:00:00 2001
From: Peter Mattis
Date: Fri, 1 Feb 2019 11:20:06 -0500
Subject: [PATCH 2/2] storage: always gossip node liveness after lease changes
 hands

Similar to #17285. We need to always gossip node liveness infos after
the node-liveness range lease changes hands due to the way gossip
handles propagation of infos which "loopback" to the sender. Due to the
highwater stamp mechanism, such infos are never sent back to the sender.
If the gossip info is subsequently never re-gossiped, a node in the
cluster may never see the info. This is only a problem for gossip keys
which can be gossiped from multiple nodes, such as the node liveness
keys and the system config key. The other keys which can be gossiped
from multiple nodes are the table-stats-added key, and the
table-disable-merges key, neither of which appears to have problematic
scenarios associated with the failure to propagate them back to the
sender after a restart.

See #34494

Release note: None
---
 pkg/storage/client_lease_test.go | 49 ++++++++++++++++++++++++++++++++
 pkg/storage/replica_gossip.go    |  2 +-
 2 files changed, 50 insertions(+), 1 deletion(-)

diff --git a/pkg/storage/client_lease_test.go b/pkg/storage/client_lease_test.go
index ab7dcf70a8b3..11acb0996653 100644
--- a/pkg/storage/client_lease_test.go
+++ b/pkg/storage/client_lease_test.go
@@ -17,6 +17,7 @@ package storage_test
 import (
 	"context"
 	"errors"
+	"fmt"
 	"reflect"
 	"testing"
 
@@ -29,6 +30,7 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/storage/storagepb"
 	"github.com/cockroachdb/cockroach/pkg/testutils"
 	"github.com/cockroachdb/cockroach/pkg/util/leaktest"
+	"github.com/cockroachdb/cockroach/pkg/util/log"
 )
 
 // TestStoreRangeLease verifies that regular ranges (not some special ones at
@@ -257,3 +259,50 @@ func TestGossipSystemConfigOnLeaseChange(t *testing.T) {
 		return nil
 	})
 }
+
+func TestGossipNodeLivenessOnLeaseChange(t *testing.T) {
+	defer leaktest.AfterTest(t)()
+	sc := storage.TestStoreConfig(nil)
+	sc.TestingKnobs.DisableReplicateQueue = true
+	mtc := &multiTestContext{storeConfig: &sc}
+	defer mtc.Stop()
+	const numStores = 3
+	mtc.Start(t, numStores)
+
+	rangeID := mtc.stores[0].LookupReplica(roachpb.RKey(keys.NodeLivenessSpan.Key)).RangeID
+	mtc.replicateRange(rangeID, 1, 2)
+
+	// Turn off liveness heartbeats on all nodes to ensure that updates to node
+	// liveness are not triggering gossiping.
+	for i := range mtc.nodeLivenesses {
+		mtc.nodeLivenesses[i].PauseHeartbeat(true)
+	}
+
+	nodeLivenessKey := gossip.MakeNodeLivenessKey(1)
+
+	initialStoreIdx := -1
+	for i := range mtc.stores {
+		if mtc.stores[i].Gossip().InfoOriginatedHere(nodeLivenessKey) {
+			initialStoreIdx = i
+		}
+	}
+	if initialStoreIdx == -1 {
+		t.Fatalf("no store has gossiped %s; gossip contents: %+v",
+			nodeLivenessKey, mtc.stores[0].Gossip().GetInfoStatus())
+	}
+	log.Infof(context.Background(), "%s gossiped from n%d",
+		nodeLivenessKey, mtc.stores[initialStoreIdx].Ident.NodeID)
+
+	newStoreIdx := (initialStoreIdx + 1) % numStores
+	mtc.transferLease(context.Background(), rangeID, initialStoreIdx, newStoreIdx)
+
+	testutils.SucceedsSoon(t, func() error {
+		if mtc.stores[initialStoreIdx].Gossip().InfoOriginatedHere(nodeLivenessKey) {
+			return fmt.Errorf("%s still most recently gossiped by original leaseholder", nodeLivenessKey)
+		}
+		if !mtc.stores[newStoreIdx].Gossip().InfoOriginatedHere(nodeLivenessKey) {
+			return fmt.Errorf("%s not most recently gossiped by new leaseholder", nodeLivenessKey)
+		}
+		return nil
+	})
+}

diff --git a/pkg/storage/replica_gossip.go b/pkg/storage/replica_gossip.go
index 04472a2be867..26419c0955dc 100644
--- a/pkg/storage/replica_gossip.go
+++ b/pkg/storage/replica_gossip.go
@@ -156,7 +156,7 @@ func (r *Replica) MaybeGossipNodeLiveness(ctx context.Context, span roachpb.Span
 	key := gossip.MakeNodeLivenessKey(kvLiveness.NodeID)
 	// Look up liveness from gossip; skip gossiping anew if unchanged.
 	if err := r.store.Gossip().GetInfoProto(key, &gossipLiveness); err == nil {
-		if gossipLiveness == kvLiveness {
+		if gossipLiveness == kvLiveness && r.store.Gossip().InfoOriginatedHere(key) {
 			continue
 		}
 	}