kvserver: kill TestSystemZoneConfigs
Fixes #98200. This test was written in pre-spanconfig days, and when
spanconfigs were enabled by default over a year ago, it opted out of
using them. It's a real chore to bring this old test back up to spec
(#100210 is an earlier attempt). It has been skipped for a while after
flaking (for test-only reasons that are understood, see #100210), and
it is notoriously slow, taking 30+s, since it waits for actual
upreplication and replica movement, making it not --stress friendly.

In our earlier attempt to upgrade this test to use spanconfigs, we
learned two things:

- There was a latent bug, previously thought to have been fixed in
  #75939. In very rare cases, right during cluster bootstrap and before
  the span config reconciler has ever had a chance to run (i.e.
  system.span_configurations is empty), it was possible for the
  subscriber to have subscribed to an empty span config state (we've
  only seen this happen in unit tests with 50ms scan intervals). So it
  had not been meaningfully "updated" in any sense of the word, yet we
  still set a non-empty last-updated timestamp, something various
  components in KV rely on as proof that we have span configs as of
  some timestamp. As a result, we saw KV incorrectly merge away the
  liveness range into adjacent ranges, and then later split it off. We
  don't think we've ever seen this happen outside of tests, as it
  instantly triggers the following fatal in the raftScheduler, which
  wants to prioritize the liveness range above all else:
    panic: priority range ID already set: old=2, new=61, first set at:
  This bug continues to exist. We've filed #104195 to track fixing it.

- Fixing the bug above (by erroring out until a span config snapshot
  is available; see the sketch after this list) made it so that tests
  now needed to actively wait for a span config snapshot before
  relocating ranges manually or using certain kv queues. Adding that
  synchronization made lots of tests a whole lot slower (by 3+s each)
  despite reducing the closed timestamp interval, etc. These tests
  weren't really being harmed by the bug (== an empty span config
  snapshot), so it's not clear that the bug is worth fixing. But that
  can be litigated in #104195.
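
For illustration, here is a minimal, self-contained Go sketch of the
behavior described above. It is not the actual spanconfig subscriber
code; the snapshot type and both methods are hypothetical stand-ins
that contrast the buggy "always report a last-updated timestamp"
behavior with the error-until-snapshot approach floated in #104195.

  package main

  import (
      "errors"
      "fmt"
      "time"
  )

  // snapshot stands in for a subscriber's view of the span config state.
  type snapshot struct {
      entries     int       // span config records applied so far
      lastUpdated time.Time // zero until a snapshot has actually been applied
  }

  // lastUpdatedBuggy mirrors the bug: it reports a non-empty timestamp even
  // when the subscriber has only ever seen an empty state, so callers wrongly
  // conclude they hold span configs as of that time.
  func (s *snapshot) lastUpdatedBuggy(now time.Time) time.Time {
      return now
  }

  // lastUpdatedFixed sketches the fix: error out until a non-empty snapshot
  // has been applied, forcing callers (merges, manual relocations, certain kv
  // queues) to wait instead of acting on empty span configs.
  func (s *snapshot) lastUpdatedFixed() (time.Time, error) {
      if s.entries == 0 || s.lastUpdated.IsZero() {
          return time.Time{}, errors.New("span config snapshot not yet available")
      }
      return s.lastUpdated, nil
  }

  func main() {
      s := &snapshot{} // freshly bootstrapped: system.span_configurations empty
      fmt.Println("buggy:", s.lastUpdatedBuggy(time.Now())) // looks "updated"
      if _, err := s.lastUpdatedFixed(); err != nil {
          fmt.Println("fixed:", err) // callers must wait for a real snapshot
      }
  }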

We don't really need this test in its current form (end-to-end
spanconfig tests exist elsewhere and are more comprehensive, without
suffering from the issues above).

Release note: None
irfansharif committed Jun 27, 2023
1 parent 906e7ad commit 8395a21
Showing 1 changed file with 0 additions and 114 deletions.
114 changes: 0 additions & 114 deletions pkg/kv/kvserver/client_replica_test.go
@@ -25,7 +25,6 @@ import (
"time"

"github.com/cockroachdb/cockroach/pkg/base"
"github.com/cockroachdb/cockroach/pkg/config/zonepb"
"github.com/cockroachdb/cockroach/pkg/keys"
"github.com/cockroachdb/cockroach/pkg/kv"
"github.com/cockroachdb/cockroach/pkg/kv/kvclient/rangefeed/rangefeedcache"
@@ -35,7 +34,6 @@ import (
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/concurrency/isolation"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverbase"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverpb"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvstorage"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/protectedts"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/protectedts/ptpb"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/protectedts/ptutil"
@@ -2686,118 +2684,6 @@ func TestChangeReplicasGeneration(t *testing.T) {
	assert.EqualValues(t, repl.Desc().Generation, oldGeneration+3, "\nold: %+v\nnew: %+v", oldDesc, newDesc)
}

func TestSystemZoneConfigs(t *testing.T) {
	defer leaktest.AfterTest(t)()
	defer log.Scope(t).Close(t)

	skip.WithIssue(t, 98905)

	// This test is relatively slow and resource intensive. When run under
	// stressrace on a loaded machine (as in the nightly tests), sometimes the
	// SucceedsSoon conditions below take longer than the allotted time (#25273).
	skip.UnderRace(t)
	skip.UnderShort(t)
	skip.UnderStress(t)

	ctx := context.Background()
	tc := testcluster.StartTestCluster(t, 7, base.TestClusterArgs{
		ServerArgs: base.TestServerArgs{
			Knobs: base.TestingKnobs{
				Store: &kvserver.StoreTestingKnobs{
					// Disable LBS because when the scan is happening at the rate it's happening
					// below, it's possible that one of the system ranges trigger a split.
					DisableLoadBasedSplitting: true,
				},
			},
			// This test was written for the gossip-backed SystemConfigSpan
			// infrastructure.
			DisableSpanConfigs: true,
			// Scan like a bat out of hell to ensure replication and replica GC
			// happen in a timely manner.
			ScanInterval: 50 * time.Millisecond,
		},
	})
	defer tc.Stopper().Stop(ctx)
	log.Info(ctx, "TestSystemZoneConfig: test cluster started")

	expectedSystemRanges, err := tc.Servers[0].ExpectedInitialRangeCount()
	if err != nil {
		t.Fatal(err)
	}
	expectedUserRanges := 1
	expectedSystemRanges -= expectedUserRanges
	systemNumReplicas := int(*zonepb.DefaultSystemZoneConfig().NumReplicas)
	userNumReplicas := int(*zonepb.DefaultZoneConfig().NumReplicas)
	expectedReplicas := expectedSystemRanges*systemNumReplicas + expectedUserRanges*userNumReplicas
	log.Infof(ctx, "TestSystemZoneConfig: expecting %d system ranges and %d user ranges",
		expectedSystemRanges, expectedUserRanges)
	log.Infof(ctx, "TestSystemZoneConfig: expected (%dx%d) + (%dx%d) = %d replicas total",
		expectedSystemRanges, systemNumReplicas, expectedUserRanges, userNumReplicas, expectedReplicas)

	waitForReplicas := func() error {
		replicas := make(map[roachpb.RangeID]roachpb.RangeDescriptor)
		for _, s := range tc.Servers {
			if err := kvstorage.IterateRangeDescriptorsFromDisk(ctx, s.Engines()[0], func(desc roachpb.RangeDescriptor) error {
				if len(desc.Replicas().LearnerDescriptors()) > 0 {
					return fmt.Errorf("descriptor contains learners: %v", desc)
				}
				if existing, ok := replicas[desc.RangeID]; ok && !existing.Equal(&desc) {
					return fmt.Errorf("mismatch between\n%s\n%s", &existing, &desc)
				}
				replicas[desc.RangeID] = desc
				return nil
			}); err != nil {
				return err
			}
		}
		var totalReplicas int
		for _, desc := range replicas {
			totalReplicas += len(desc.Replicas().VoterDescriptors())
		}
		if totalReplicas != expectedReplicas {
			return fmt.Errorf("got %d voters, want %d; details: %+v", totalReplicas, expectedReplicas, replicas)
		}
		return nil
	}

	// Wait until we're down to the expected number of replicas. This is
	// effectively waiting on replica GC to kick in to destroy any replicas that
	// got removed during rebalancing of the initial ranges, since the testcluster
	// waits until nothing is underreplicated but not until all rebalancing has
	// settled down.
	testutils.SucceedsSoon(t, waitForReplicas)
	log.Info(ctx, "TestSystemZoneConfig: initial replication succeeded")

	// Update the meta zone config to have more replicas and expect the number
	// of replicas to go up accordingly after running all replicas through the
	// replicate queue.
	sqlDB := sqlutils.MakeSQLRunner(tc.ServerConn(0))
	sqlutils.SetZoneConfig(t, sqlDB, "RANGE meta", "num_replicas: 7")
	expectedReplicas += 2
	testutils.SucceedsSoon(t, waitForReplicas)
	log.Info(ctx, "TestSystemZoneConfig: up-replication of meta ranges succeeded")

	// Do the same thing, but down-replicating the timeseries range.
	sqlutils.SetZoneConfig(t, sqlDB, "RANGE timeseries", "num_replicas: 1")
	expectedReplicas -= 2
	testutils.SucceedsSoon(t, waitForReplicas)
	log.Info(ctx, "TestSystemZoneConfig: down-replication of timeseries ranges succeeded")

	// Up-replicate the system.jobs table to demonstrate that it is configured
	// independently from the system database.
	sqlutils.SetZoneConfig(t, sqlDB, "TABLE system.jobs", "num_replicas: 7")
	expectedReplicas += 2
	testutils.SucceedsSoon(t, waitForReplicas)
	log.Info(ctx, "TestSystemZoneConfig: up-replication of jobs table succeeded")

	// Finally, verify the system ranges. Note that in a new cluster there are
	// two system ranges, which we have to take into account here.
	sqlutils.SetZoneConfig(t, sqlDB, "RANGE system", "num_replicas: 7")
	expectedReplicas += 4
	testutils.SucceedsSoon(t, waitForReplicas)
	log.Info(ctx, "TestSystemZoneConfig: up-replication of system ranges succeeded")
}

func TestClearRange(t *testing.T) {
	defer leaktest.AfterTest(t)()
	defer log.Scope(t).Close(t)
