From 8885e3c23ad9af19a0d061d9f455c0146b0cedfe Mon Sep 17 00:00:00 2001
From: irfan sharif
Date: Wed, 7 Jul 2021 12:21:43 -0400
Subject: [PATCH] migration,keys: fix incorrect range descriptor iteration

The long-running migrations infrastructure exposes an
`IterateRangeDescriptors` interface to paginate through all the ranges
in the system. It was previously doing this (erroneously) by scanning
through the meta2 range. It was thus possible to entirely miss
descriptors for the meta ranges themselves. This could only happen for
large enough clusters with splits in the meta range. This issue was
caught during the careful review of #66445.

We remedy the situation by scanning over the entire `[MetaMin, MetaMax)`
span instead. One thing to take care of is the possibility of finding
the same range descriptor in both the `meta1` and `meta2` range. This is
possible when the first range in the system includes part of the user
keyspace (typically `[/Min, /System/NodeLiveness)`). We de-duplicate
these descriptors by remembering the last range ID seen in meta1; since
the scan proceeds in sorted key order, checking against that single ID
is sufficient.

---

What does this mean for existing long-running migrations? There was
only one that made use of this descriptor iteration API: the truncated
state migration. Fortunately that migration was a below-raft one, so as
part of it we purged all extant replicas of ranges that hadn't seen the
corresponding `Migrate` command (see `postTruncatedStateMigration` and
`PurgeOutdatedReplicas`; the purge attempt reaches out to every store
and ensures every replica has either been migrated or GC-ed). If we had
found unmigrated ranges (possible for the meta ranges as described
above), the migration would have been blocked.

We haven't seen this happen with users running 21.1, which makes sense
since this bug is only possible in clusters with enough ranges to
necessitate a split in the meta range (very unlikely with our large
default range size of 512 MB). Still, we'll backport this PR to 21.1.
Should users run into this stalled migration, we'll recommend upgrading
to whatever patch release this will be part of.

Release note: None
Release justification: bug fixes and low-risk updates to new
functionality
---
 pkg/keys/keys.go                              |  5 ++
 pkg/migration/migrationcluster/BUILD.bazel    |  1 +
 pkg/migration/migrationcluster/client_test.go | 77 ++++++++++++-------
 pkg/migration/migrationcluster/cluster.go     | 55 ++++++++-----
 4 files changed, 89 insertions(+), 49 deletions(-)

diff --git a/pkg/keys/keys.go b/pkg/keys/keys.go
index a0a1b421968e..909ced943dee 100644
--- a/pkg/keys/keys.go
+++ b/pkg/keys/keys.go
@@ -657,6 +657,11 @@ func UserKey(key roachpb.RKey) roachpb.RKey {
 	return buf
 }
 
+// InMeta1 returns true iff a key is in the meta1 range (which includes RKeyMin).
+func InMeta1(k roachpb.RKey) bool {
+	return k.Equal(roachpb.RKeyMin) || bytes.HasPrefix(k, MustAddr(Meta1Prefix))
+}
+
 // validateRangeMetaKey validates that the given key is a valid Range Metadata
 // key. This checks only the constraints common to forward and backwards scans:
 // correct prefix and not exceeding KeyMax.
diff --git a/pkg/migration/migrationcluster/BUILD.bazel b/pkg/migration/migrationcluster/BUILD.bazel index 9267df9b6b1f..71e51d5e48ab 100644 --- a/pkg/migration/migrationcluster/BUILD.bazel +++ b/pkg/migration/migrationcluster/BUILD.bazel @@ -36,6 +36,7 @@ go_test( ], embed = [":migrationcluster"], deps = [ + "//pkg/keys", "//pkg/kv/kvserver", "//pkg/migration/nodelivenesstest", "//pkg/roachpb", diff --git a/pkg/migration/migrationcluster/client_test.go b/pkg/migration/migrationcluster/client_test.go index 3977e779bc2d..2ee48cb256b7 100644 --- a/pkg/migration/migrationcluster/client_test.go +++ b/pkg/migration/migrationcluster/client_test.go @@ -12,8 +12,10 @@ package migrationcluster_test import ( "context" + "fmt" "testing" + "github.com/cockroachdb/cockroach/pkg/keys" "github.com/cockroachdb/cockroach/pkg/kv/kvserver" "github.com/cockroachdb/cockroach/pkg/migration/migrationcluster" "github.com/cockroachdb/cockroach/pkg/migration/nodelivenesstest" @@ -23,43 +25,60 @@ import ( "github.com/cockroachdb/cockroach/pkg/util/leaktest" ) -func TestCluster_IterateRangeDescriptors(t *testing.T) { +func TestClusterIterateRangeDescriptors(t *testing.T) { defer leaktest.AfterTest(t)() ctx := context.Background() const numNodes = 1 - params, _ := tests.CreateTestServerParams() - server, _, kvDB := serverutils.StartServer(t, params) - defer server.Stopper().Stop(context.Background()) + for _, splits := range [][]roachpb.Key{ + {}, // no splits + {keys.Meta2Prefix}, // split between meta1 and meta2 + {keys.SystemPrefix}, // split after the meta range + {keys.Meta2Prefix, keys.SystemPrefix}, // split before and after meta2 + {keys.RangeMetaKey(roachpb.RKey("middle")).AsRawKey()}, // split within meta2 + {keys.Meta2Prefix, keys.RangeMetaKey(roachpb.RKey("middle")).AsRawKey()}, // split at start of and within meta2 + } { + t.Run(fmt.Sprintf("with-splits-at=%s", splits), func(t *testing.T) { + params, _ := tests.CreateTestServerParams() + server, _, kvDB := serverutils.StartServer(t, params) + defer server.Stopper().Stop(context.Background()) - var numRanges int - if err := server.GetStores().(*kvserver.Stores).VisitStores(func(s *kvserver.Store) error { - numRanges = s.ReplicaCount() - return nil - }); err != nil { - t.Fatal(err) - } + for _, split := range splits { + if _, _, err := server.SplitRange(split); err != nil { + t.Fatal(err) + } + } + + var numRanges int + if err := server.GetStores().(*kvserver.Stores).VisitStores(func(s *kvserver.Store) error { + numRanges = s.ReplicaCount() + return nil + }); err != nil { + t.Fatal(err) + } - c := nodelivenesstest.New(numNodes) - h := migrationcluster.New(migrationcluster.ClusterConfig{ - NodeLiveness: c, - Dialer: migrationcluster.NoopDialer{}, - DB: kvDB, - }) + c := nodelivenesstest.New(numNodes) + h := migrationcluster.New(migrationcluster.ClusterConfig{ + NodeLiveness: c, + Dialer: migrationcluster.NoopDialer{}, + DB: kvDB, + }) - for _, blockSize := range []int{1, 5, 10, 50} { - var numDescs int - init := func() { numDescs = 0 } - if err := h.IterateRangeDescriptors(ctx, blockSize, init, func(descriptors ...roachpb.RangeDescriptor) error { - numDescs += len(descriptors) - return nil - }); err != nil { - t.Fatal(err) - } + for _, blockSize := range []int{1, 5, 10, 50} { + var numDescs int + init := func() { numDescs = 0 } + if err := h.IterateRangeDescriptors(ctx, blockSize, init, func(descriptors ...roachpb.RangeDescriptor) error { + numDescs += len(descriptors) + return nil + }); err != nil { + t.Fatal(err) + } 
-		if numDescs != numRanges {
-			t.Fatalf("expected to find %d ranges, found %d", numRanges+1, numDescs)
-		}
+				if numDescs != numRanges {
+					t.Fatalf("expected to find %d ranges, found %d", numRanges, numDescs)
+				}
+			}
+		})
 	}
 }
diff --git a/pkg/migration/migrationcluster/cluster.go b/pkg/migration/migrationcluster/cluster.go
index 7a9d2490d18a..6a5aac6ddfba 100644
--- a/pkg/migration/migrationcluster/cluster.go
+++ b/pkg/migration/migrationcluster/cluster.go
@@ -134,37 +134,52 @@ func (c *Cluster) ForEveryNode(
 func (c *Cluster) IterateRangeDescriptors(
 	ctx context.Context, blockSize int, init func(), fn func(...roachpb.RangeDescriptor) error,
 ) error {
-	if err := c.DB().Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
+	return c.DB().Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
 		// Inform the caller that we're starting a fresh attempt to page in
 		// range descriptors.
 		init()
 
-		// Iterate through meta2 to pull out all the range descriptors.
-		return txn.Iterate(ctx, keys.Meta2Prefix, keys.MetaMax, blockSize,
+		// Iterate through meta{1,2} to pull out all the range descriptors.
+		var lastRangeIDInMeta1 roachpb.RangeID
+		return txn.Iterate(ctx, keys.MetaMin, keys.MetaMax, blockSize,
 			func(rows []kv.KeyValue) error {
-				descriptors := make([]roachpb.RangeDescriptor, len(rows))
-				for i, row := range rows {
-					if err := row.ValueProto(&descriptors[i]); err != nil {
-						return errors.Wrapf(err,
-							"unable to unmarshal range descriptor from %s",
-							row.Key,
-						)
+				descriptors := make([]roachpb.RangeDescriptor, 0, len(rows))
+				var desc roachpb.RangeDescriptor
+				for _, row := range rows {
+					err := row.ValueProto(&desc)
+					if err != nil {
+						return errors.Wrapf(err, "unable to unmarshal range descriptor from %s", row.Key)
+					}
+
+					// In small enough clusters it's possible for the same range
+					// descriptor to be stored in both meta1 and meta2. This
+					// happens when some range spans both the meta and the user
+					// keyspace. Consider when r1 is [/Min,
+					// /System/NodeLiveness); we'll store the range descriptor
+					// in both /Meta2/<r1's end key> and in /Meta1/KeyMax[1].
+					//
+					// As part of iterating we'll de-duplicate this descriptor
+					// away by checking whether we've seen it before in meta1.
+					// Since we're scanning over the meta range in sorted
+					// order, it's enough to check against the last range
+					// descriptor we've seen in meta1.
+					//
+					// [1]: See kvserver.rangeAddressing.
+					if desc.RangeID == lastRangeIDInMeta1 {
+						continue
+					}
+
+					descriptors = append(descriptors, desc)
+					if keys.InMeta1(keys.RangeMetaKey(desc.StartKey)) {
+						lastRangeIDInMeta1 = desc.RangeID
 					}
 				}
 
 				// Invoke fn with the current chunk (of size ~blockSize) of
 				// range descriptors.
-				if err := fn(descriptors...); err != nil {
-					return err
-				}
-
-				return nil
+				return fn(descriptors...)
 			})
-	}); err != nil {
-		return err
-	}
-
-	return nil
+	})
 }
 
 // DB exposes the underlying *kv.DB instance.
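
To make the de-duplication argument concrete, here is a small, self-contained Go sketch of the same idea. It is not part of the patch: `entry`, `meta1Prefix`, and `dedupMetaScan` are simplified stand-ins for `kv.KeyValue`, `keys.Meta1Prefix`, and the loop body inside `IterateRangeDescriptors`. Because meta1 keys sort before meta2 keys, and the only possible duplicate is the descriptor whose meta1 record was seen last, remembering a single range ID is enough.

// dedup_sketch.go: illustrative only, not part of the patch.
package main

import (
	"bytes"
	"fmt"
)

// entry models one row returned by the meta-range scan: the meta key the
// descriptor was stored under, and the ID of the range it describes.
type entry struct {
	metaKey []byte
	rangeID int64
}

// meta1Prefix is a stand-in for keys.Meta1Prefix; meta1 keys sort before
// meta2 keys in the scan.
var meta1Prefix = []byte{0x02}

// dedupMetaScan walks rows in ascending key order and drops the descriptor
// that appears under both a meta1 and a meta2 key, by remembering the last
// range ID seen under the meta1 prefix.
func dedupMetaScan(rows []entry) []int64 {
	var lastRangeIDInMeta1 int64
	var out []int64
	for _, row := range rows {
		if row.rangeID == lastRangeIDInMeta1 {
			continue // already collected via its meta1 record
		}
		out = append(out, row.rangeID)
		if bytes.HasPrefix(row.metaKey, meta1Prefix) {
			lastRangeIDInMeta1 = row.rangeID
		}
	}
	return out
}

func main() {
	// r1 spans the meta and user keyspace, so it is addressed both under
	// /Meta1/KeyMax and as the first /Meta2/... record.
	rows := []entry{
		{metaKey: []byte{0x02, 0xff}, rangeID: 1}, // meta1 record for r1
		{metaKey: []byte{0x03, 0x01}, rangeID: 1}, // duplicate meta2 record for r1
		{metaKey: []byte{0x03, 0x02}, rangeID: 2},
		{metaKey: []byte{0x03, 0x03}, rangeID: 3},
	}
	fmt.Println(dedupMetaScan(rows)) // [1 2 3]
}

Running the sketch prints [1 2 3]: r1's meta2 record is dropped because its range ID matches the last one recorded under the meta1 prefix, while r2 and r3 pass through untouched.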