-
Notifications
You must be signed in to change notification settings - Fork 3.9k
/
Copy pathrunner.go
700 lines (612 loc) · 20.3 KB
/
runner.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
// Copyright 2022 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.
package mixedversion
import (
"context"
gosql "database/sql"
"fmt"
"os"
"path"
"path/filepath"
"regexp"
"runtime/debug"
"strconv"
"strings"
"sync"
"sync/atomic"
"time"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil/clusterupgrade"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/roachprod/logger"
"github.com/cockroachdb/cockroach/pkg/util/ctxgroup"
"github.com/cockroachdb/cockroach/pkg/util/syncutil"
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
)
type (
// Helper is the struct passed to user-functions providing helper
// functions that mixed-version tests can use.
Helper struct {
ctx context.Context
testContext *Context
// bgCount keeps track of the number of background tasks started
// with `helper.Background()`. The counter is used to generate
// unique log file names.
bgCount int64
runner *testRunner
stepLogger *logger.Logger
}
// backgroundEvent is the struct sent by background steps when they
// finish (successfully or not).
backgroundEvent struct {
Name string
Err error
TriggeredByTest bool
}
backgroundRunner struct {
group ctxgroup.Group
ctx context.Context
events chan backgroundEvent
logger *logger.Logger
stopFuncs []StopFunc
}
testFailure struct {
summarized bool
description string
seed int64
binaryVersions []roachpb.Version
// Cluster versions before and after the failure occurred. Before
// each step is executed, the test runner will cache each node's
// view of the cluster version; after a failure occurs, we'll try
// to read the cluster version from every node again. This context
// is added to the failure message displayed to the user with the
// intention of highlighting whether the cluster version changed
// during the failure, which is useful for test failures that
// happen while the upgrade is finalizing.
clusterVersionsBefore []roachpb.Version
clusterVersionsAfter []roachpb.Version
}
// crdbMonitor is a thin wrapper around the roachtest monitor API
// (cluster.NewMonitor) that produces error events through a channel
// whenever an unexpected node death happens. It also allows us to
// provide an API for test authors to inform the framework that a
// node death is expected if the test performs its own restarts or
// chaos events.
crdbMonitor struct {
once sync.Once
crdbNodes option.NodeListOption
monitor cluster.Monitor
errCh chan error
}
testRunner struct {
ctx context.Context
cancel context.CancelFunc
plan *TestPlan
cluster cluster.Cluster
crdbNodes option.NodeListOption
seed int64
logger *logger.Logger
binaryVersions atomic.Value
clusterVersions atomic.Value
background *backgroundRunner
monitor *crdbMonitor
connCache struct {
mu syncutil.Mutex
cache []*gosql.DB
}
}
)
var (
// everything that is not an alphanum or a few special characters
invalidChars = regexp.MustCompile(`[^a-zA-Z0-9 \-_\.]`)
)
func newTestRunner(
ctx context.Context,
cancel context.CancelFunc,
plan *TestPlan,
l *logger.Logger,
c cluster.Cluster,
crdbNodes option.NodeListOption,
randomSeed int64,
) *testRunner {
return &testRunner{
ctx: ctx,
cancel: cancel,
plan: plan,
logger: l,
cluster: c,
crdbNodes: crdbNodes,
background: newBackgroundRunner(ctx, l),
monitor: newCRDBMonitor(ctx, c, crdbNodes),
seed: randomSeed,
}
}
// run implements the test running logic, which boils down to running
// each step in sequence.
func (tr *testRunner) run() (retErr error) {
stepsErr := make(chan error)
defer func() { tr.teardown(stepsErr, retErr != nil) }()
go func() {
defer close(stepsErr)
for _, step := range tr.plan.steps {
if err := tr.runStep(tr.ctx, step); err != nil {
stepsErr <- err
return
}
}
}()
for {
select {
case err := <-stepsErr:
return err
case event := <-tr.background.CompletedEvents():
if event.Err == nil {
tr.logger.Printf("background step finished: %s", event.Name)
continue
} else if event.TriggeredByTest {
tr.logger.Printf("background step canceled by test: %s", event.Name)
continue
}
return fmt.Errorf("background step `%s` returned error: %w", event.Name, event.Err)
case err := <-tr.monitor.Err():
return tr.testFailure(err.Error(), tr.logger)
}
}
}
// runStep contains the logic of running a single test step, called
// recursively in the case of sequentialRunStep and concurrentRunStep.
func (tr *testRunner) runStep(ctx context.Context, step testStep) error {
if ss, ok := step.(singleStep); ok {
if ss.ID() > tr.plan.startClusterID {
// update the runner's view of the cluster's binary and cluster
// versions before every non-initialization `singleStep` is
// executed
if err := tr.maybeInitConnections(); err != nil {
return err
}
if err := tr.refreshBinaryVersions(); err != nil {
return err
}
if err := tr.refreshClusterVersions(); err != nil {
return err
}
tr.monitor.Init()
}
}
switch s := step.(type) {
case sequentialRunStep:
for _, ss := range s.steps {
if err := tr.runStep(ctx, ss); err != nil {
return err
}
}
return nil
case concurrentRunStep:
group := ctxgroup.WithContext(tr.ctx)
for _, cs := range s.delayedSteps {
cs := cs
group.GoCtx(func(concurrentCtx context.Context) error {
return tr.runStep(concurrentCtx, cs)
})
}
return group.Wait()
case delayedStep:
time.Sleep(s.delay)
return tr.runStep(ctx, s.step)
default:
ss := s.(singleStep)
stepLogger, err := tr.loggerFor(ss)
if err != nil {
return err
}
if stopChan := ss.Background(); stopChan != nil {
tr.startBackgroundStep(ss, stepLogger, stopChan)
return nil
}
return tr.runSingleStep(ctx, ss, stepLogger)
}
}
// runSingleStep takes care of the logic of running a `singleStep`,
// including logging start and finish times, wrapping the error (if
// any) with useful information, and renaming the log file to indicate
// failure. This logic is the same whether running a step in the
// background or not.
func (tr *testRunner) runSingleStep(ctx context.Context, ss singleStep, l *logger.Logger) error {
tr.logStep("STARTING", ss, l)
tr.logVersions(l)
start := timeutil.Now()
defer func() {
prefix := fmt.Sprintf("FINISHED [%s]", timeutil.Since(start))
tr.logStep(prefix, ss, l)
}()
if err := panicAsError(l, func() error {
return ss.Run(ctx, l, tr.cluster, tr.newHelper(ctx, l))
}); err != nil {
if isContextCanceled(ctx) {
l.Printf("step terminated (context canceled)")
// Avoid creating a `stepError` (which involves querying binary
// and cluster versions) when the context was canceled as the
// main error (that caused the context to be canceled) will
// already include relevant information. This context
// cancelation could also be happening because the test author
// is explicitly stopping a background step, so running those
// queries would be wasteful.
return err
}
return tr.stepError(err, ss, l)
}
return nil
}
func (tr *testRunner) startBackgroundStep(ss singleStep, l *logger.Logger, stopChan shouldStop) {
stop := tr.background.Start(ss.Description(), func(ctx context.Context) error {
return tr.runSingleStep(ctx, ss, l)
})
// We start a goroutine to listen for user-requests to stop the
// background function.
go func() {
select {
case <-stopChan:
// Test has requested the background function to stop.
stop()
case <-tr.ctx.Done():
// Parent context is done (test has finished).
return
}
}()
}
// stepError generates a `testFailure` error by augmenting the error
// passed with extra information. Specifically, the error message will
// include the ID of the step that failed, the random seed used, the
// binary version on each node when the error occurred, and the
// cluster version before and after the step (in case the failure
// happened *while* the cluster version was updating).
func (tr *testRunner) stepError(err error, step singleStep, l *logger.Logger) error {
desc := fmt.Sprintf("mixed-version test failure while running step %d (%s): %s",
step.ID(), step.Description(), err,
)
return tr.testFailure(desc, l)
}
// testFailure generates a `testFailure` with the given
// description. It logs the error to the logger passed, and renames
// the underlying file to include the "FAILED" prefix to help in
// debugging.
func (tr *testRunner) testFailure(desc string, l *logger.Logger) error {
clusterVersionsBefore := tr.clusterVersions
var clusterVersionsAfter atomic.Value
if tr.connCacheInitialized() {
if err := tr.refreshClusterVersions(); err != nil {
tr.logger.Printf("failed to fetch cluster versions after failure: %s", err)
} else {
clusterVersionsAfter = tr.clusterVersions
}
}
tf := &testFailure{
description: desc,
seed: tr.seed,
binaryVersions: loadAtomicVersions(tr.binaryVersions),
clusterVersionsBefore: loadAtomicVersions(clusterVersionsBefore),
clusterVersionsAfter: loadAtomicVersions(clusterVersionsAfter),
}
// Print the test failure on the step's logger for convenience, and
// to reduce cross referencing of logs.
l.Printf("%v", tf)
if err := renameFailedLogger(l); err != nil {
tr.logger.Printf("could not rename failed step logger: %v", err)
}
return tf
}
// teardown groups together all tasks that happen once a test finishes.
func (tr *testRunner) teardown(stepsChan chan error, testFailed bool) {
if testFailed {
tr.logger.Printf("mixed-version test FAILED")
} else {
tr.logger.Printf("mixed-version test PASSED")
}
tr.cancel()
// Stop background functions explicitly so that the corresponding
// termination is marked `TriggeredByTest` (not necessary for
// correctness, just for clarity).
tr.logger.Printf("stopping background functions")
tr.background.Terminate()
// If the test failed, we wait for any currently running steps to
// return before passing control back to the roachtest
// framework. This achieves a test.log that does not contain any
// test step output once roachtest started to collect failure
// artifacts, which would be confusing.
if testFailed {
tr.logger.Printf("waiting for all steps to finish after context cancelation")
waitForChannel(stepsChan, "test steps", tr.logger)
}
tr.logger.Printf("closing database connections")
tr.closeConnections()
}
func (tr *testRunner) logStep(prefix string, step singleStep, l *logger.Logger) {
dashes := strings.Repeat("-", 10)
l.Printf("%[1]s %s (%d): %s %[1]s", dashes, prefix, step.ID(), step.Description())
}
// logVersions writes the current cached versions of the binary and
// cluster versions on each node. The cached versions should exist for
// all steps but the first one (when we start the cluster itself).
func (tr *testRunner) logVersions(l *logger.Logger) {
binaryVersions := loadAtomicVersions(tr.binaryVersions)
clusterVersions := loadAtomicVersions(tr.clusterVersions)
if binaryVersions == nil || clusterVersions == nil {
return
}
l.Printf("binary versions: %s", formatVersions(binaryVersions))
l.Printf("cluster versions: %s", formatVersions(clusterVersions))
}
// loggerFor creates a logger instance to be used by a test step. Logs
// will be available under `mixed-version-test/{ID}.log`, making it
// easy to go from the IDs displayed in the test plan to the
// corresponding output of that step.
func (tr *testRunner) loggerFor(step singleStep) (*logger.Logger, error) {
name := invalidChars.ReplaceAllString(strings.ToLower(step.Description()), "")
name = fmt.Sprintf("%d_%s", step.ID(), name)
prefix := path.Join(logPrefix, name)
return prefixedLogger(tr.logger, prefix)
}
// refreshBinaryVersions updates the internal `binaryVersions` field
// with the binary version running on each node of the cluster. We use
// the `atomic` package here as this function may be called by two
// steps that are running concurrently.
func (tr *testRunner) refreshBinaryVersions() error {
newBinaryVersions := make([]roachpb.Version, 0, len(tr.crdbNodes))
for _, node := range tr.crdbNodes {
bv, err := clusterupgrade.BinaryVersion(tr.conn(node))
if err != nil {
return fmt.Errorf("failed to get binary version for node %d: %w", node, err)
}
newBinaryVersions = append(newBinaryVersions, bv)
}
tr.binaryVersions.Store(newBinaryVersions)
return nil
}
// refreshClusterVersions updates the internal `clusterVersions` field
// with the current view of the cluster version in each of the nodes
// of the cluster.
func (tr *testRunner) refreshClusterVersions() error {
newClusterVersions := make([]roachpb.Version, 0, len(tr.crdbNodes))
for _, node := range tr.crdbNodes {
cv, err := clusterupgrade.ClusterVersion(tr.ctx, tr.conn(node))
if err != nil {
return fmt.Errorf("failed to get cluster version for node %d: %w", node, err)
}
newClusterVersions = append(newClusterVersions, cv)
}
tr.clusterVersions.Store(newClusterVersions)
return nil
}
// maybeInitConnections initialize connections if the connection cache
// is empty. When the function returns, either the `connCache` field
// is populated with a connection for every crdb node, or the field is
// left untouched, and an error is returned.
func (tr *testRunner) maybeInitConnections() error {
tr.connCache.mu.Lock()
defer tr.connCache.mu.Unlock()
if tr.connCache.cache != nil {
return nil
}
cc := make([]*gosql.DB, len(tr.crdbNodes))
for _, node := range tr.crdbNodes {
conn, err := tr.cluster.ConnE(tr.ctx, tr.logger, node)
if err != nil {
return fmt.Errorf("failed to connect to node %d: %w", node, err)
}
cc[node-1] = conn
}
tr.connCache.cache = cc
return nil
}
func (tr *testRunner) connCacheInitialized() bool {
tr.connCache.mu.Lock()
defer tr.connCache.mu.Unlock()
return tr.connCache.cache != nil
}
func (tr *testRunner) newHelper(ctx context.Context, l *logger.Logger) *Helper {
return &Helper{
ctx: ctx,
runner: tr,
stepLogger: l,
}
}
// conn returns a database connection to the given node. Assumes the
// connection cache has been previously initialized.
func (tr *testRunner) conn(node int) *gosql.DB {
tr.connCache.mu.Lock()
defer tr.connCache.mu.Unlock()
return tr.connCache.cache[node-1]
}
func (tr *testRunner) closeConnections() {
tr.connCache.mu.Lock()
defer tr.connCache.mu.Unlock()
for _, db := range tr.connCache.cache {
if db != nil {
_ = db.Close()
}
}
}
func newCRDBMonitor(
ctx context.Context, c cluster.Cluster, crdbNodes option.NodeListOption,
) *crdbMonitor {
return &crdbMonitor{
crdbNodes: crdbNodes,
monitor: c.NewMonitor(ctx, crdbNodes),
errCh: make(chan error),
}
}
// Init must be called once the cluster is initialized and the
// cockroach process is running on the nodes. Init is idempotent.
func (cm *crdbMonitor) Init() {
cm.once.Do(func() {
go func() {
if err := cm.monitor.WaitE(); err != nil {
cm.errCh <- err
}
}()
})
}
// Err returns a channel that will receive errors whenever an
// unexpected node death is observed.
func (cm *crdbMonitor) Err() chan error {
return cm.errCh
}
func (cm *crdbMonitor) ExpectDeaths(n int) {
cm.monitor.ExpectDeaths(int32(n))
}
func newBackgroundRunner(ctx context.Context, l *logger.Logger) *backgroundRunner {
g := ctxgroup.WithContext(ctx)
return &backgroundRunner{
group: g,
ctx: ctx,
logger: l,
events: make(chan backgroundEvent),
}
}
// Start will run the function `fn` in a goroutine. Any errors
// returned by that function are observable by reading from the
// channel returned by the `Events()` function. Returns a function
// that can be called to stop the background function (canceling the
// context passed to it).
func (br *backgroundRunner) Start(name string, fn func(context.Context) error) context.CancelFunc {
bgCtx, cancel := context.WithCancel(br.ctx)
var expectedContextCancelation bool
br.group.Go(func() error {
err := fn(bgCtx)
event := backgroundEvent{
Name: name,
Err: err,
TriggeredByTest: err != nil && isContextCanceled(bgCtx) && expectedContextCancelation,
}
select {
case br.events <- event:
// exit goroutine
case <-br.ctx.Done():
// Test already finished, exit goroutine.
return nil
}
return err
})
stopBgFunc := func() {
expectedContextCancelation = true
cancel()
}
// Collect all stopFuncs so that we can explicitly stop all
// background functions when the test finishes.
br.stopFuncs = append(br.stopFuncs, stopBgFunc)
return stopBgFunc
}
// Terminate will call the stop functions for every background function
// started during the test. This includes background functions created
// during test runtime (using `helper.Background()`), as well as
// background steps declared in the test setup (using
// `BackgroundFunc`, `Workload`, et al). Returns when all background
// functions have returned.
func (br *backgroundRunner) Terminate() {
for _, stop := range br.stopFuncs {
stop()
}
doneCh := make(chan error)
go func() {
defer close(doneCh)
_ = br.group.Wait()
}()
waitForChannel(doneCh, "background functions", br.logger)
}
func (br *backgroundRunner) CompletedEvents() <-chan backgroundEvent {
return br.events
}
func (tf *testFailure) Error() string {
if tf.summarized {
return tf.description
}
tf.summarized = true
debugInfo := func(label, value string) string {
return fmt.Sprintf("%-40s%s", label+":", value)
}
seedInfo := debugInfo("test random seed", strconv.FormatInt(tf.seed, 10))
binaryVersions := debugInfo("binary versions", formatVersions(tf.binaryVersions))
clusterVersionsBefore := debugInfo(
"cluster versions before failure",
formatVersions(tf.clusterVersionsBefore),
)
var clusterVersionsAfter string
if cv := tf.clusterVersionsAfter; cv != nil {
clusterVersionsBefore += "\n"
clusterVersionsAfter = debugInfo("cluster versions after failure", formatVersions(cv))
}
return fmt.Sprintf(
"%s\n%s\n%s\n%s%s",
tf.description, seedInfo, binaryVersions, clusterVersionsBefore, clusterVersionsAfter,
)
}
func renameFailedLogger(l *logger.Logger) error {
currentFileName := l.File.Name()
newLogName := path.Join(
filepath.Dir(currentFileName),
"FAILED_"+filepath.Base(currentFileName),
)
return os.Rename(currentFileName, newLogName)
}
func loadAtomicVersions(v atomic.Value) []roachpb.Version {
if v.Load() == nil {
return nil
}
return v.Load().([]roachpb.Version)
}
// panicAsError ensures that the any panics that might happen while
// the function passed runs are captured and returned as regular
// errors. A stack trace is included in the logs when that happens to
// facilitate debugging.
func panicAsError(l *logger.Logger, f func() error) (retErr error) {
defer func() {
if r := recover(); r != nil {
l.Printf("panic stack trace:\n%s", string(debug.Stack()))
retErr = fmt.Errorf("panic (stack trace above): %v", r)
}
}()
return f()
}
// waitForChannel waits for the given channel `ch` to close; returns
// when that happens. If the channel does not close within 5 minutes,
// the function logs a message and returns.
//
// The main use-case for this function is waiting for user-provided
// hooks to return after the context passed to them is canceled. We
// want to allow some time for them to finish, but we also don't want
// to block indefinitely if a function inadvertently ignores context
// cancelation.
func waitForChannel(ch chan error, desc string, l *logger.Logger) {
maxWait := 5 * time.Minute
select {
case <-ch:
// return
case <-time.After(maxWait):
l.Printf("waited for %s for %s to finish, giving up", maxWait, desc)
}
}
func formatVersions(versions []roachpb.Version) string {
var pairs []string
for idx, version := range versions {
pairs = append(pairs, fmt.Sprintf("%d: %s", idx+1, version))
}
return fmt.Sprintf("[%s]", strings.Join(pairs, ", "))
}
// isContextCanceled returns a boolean indicating whether the context
// passed is canceled.
func isContextCanceled(ctx context.Context) bool {
select {
case <-ctx.Done():
return true
default:
return false
}
}