-
Notifications
You must be signed in to change notification settings - Fork 3.8k
/
Copy pathconfig.go
839 lines (749 loc) · 36.1 KB
/
config.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
// Copyright 2015 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.
package base
import (
"context"
"math"
"math/big"
"net/url"
"os"
"time"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/security/username"
"github.com/cockroachdb/cockroach/pkg/settings/cluster"
"github.com/cockroachdb/cockroach/pkg/util/envutil"
"github.com/cockroachdb/cockroach/pkg/util/mon"
"github.com/cockroachdb/cockroach/pkg/util/retry"
"github.com/cockroachdb/redact"
)
// Base config defaults.
//
// When changing these, TestDefaultRaftConfig must also be updated via -rewrite,
// and the result copied to the defaultRangeLeaseDuration comment with any
// adjustments to the surrounding reasoning.
const (
// defaultInsecure is the value InitDefaults assigns to Config.Insecure.
defaultInsecure = false
// defaultUser is the user name InitDefaults assigns to Config.User.
defaultUser = username.RootUser
// httpScheme and httpsScheme are the URL schemes returned by
// Config.HTTPRequestScheme.
httpScheme = "http"
httpsScheme = "https"
// DefaultPort is the default port for RPC and SQL connections.
//
// From IANA Service Name and Transport Protocol Port Number Registry. See
// https://www.iana.org/assignments/service-names-port-numbers/service-names-port-numbers.xhtml?search=cockroachdb
//
// This is used for both RPC and SQL connections unless --sql-addr
// is used on the command line and/or SQLAddr is set in the Config object.
DefaultPort = "26257"
// DefaultHTTPPort is the default port for HTTP-for-humans.
DefaultHTTPPort = "8080"
// Default listen addresses assigned by InitDefaults. They are built by
// string concatenation because net.JoinHostPort is not a constant
// expression.
defaultAddr = ":" + DefaultPort
defaultSQLAddr = ":" + DefaultPort
defaultHTTPAddr = ":" + DefaultHTTPPort
// DefaultMetricsSampleInterval is the default interval between metrics
// samples.
//
// NB: this can't easily become a variable as the UI hard-codes it to 10s.
// See https://github.com/cockroachdb/cockroach/issues/20310.
DefaultMetricsSampleInterval = 10 * time.Second
// defaultRangeLeaseRenewalFraction specifies what fraction the range lease
// renewal duration should be of the range lease active time. For example,
// with a value of 0.2 and a lease duration of 10 seconds, leases would be
// eagerly renewed 8 seconds into each lease.
//
// A range lease extension requires 1 RTT (Raft consensus), assuming the
// leaseholder is colocated with the Raft leader, so 3 seconds should be
// sufficient (see NetworkTimeout). However, on user ranges, Raft consensus
// uses the DefaultClass RPC class, and is thus subject to head-of-line
// blocking by other RPC traffic which can cause very high latencies under
// heavy load (several seconds).
defaultRangeLeaseRenewalFraction = 0.5
// livenessRenewalFraction specifies what fraction the node liveness renewal
// duration should be of the node liveness duration. For example, with a value
// of 0.2 and a liveness duration of 10 seconds, each node's liveness record
// would be eagerly renewed after 8 seconds.
//
// A liveness record write requires 2 RTTs (RPC and Raft consensus). Assuming
// a max RTT of 600ms (see NetworkTimeout), 3 seconds is enough for 2 RTTs
// (2*600ms) and 1 RTO (900ms), with a 900ms buffer. The write is committed
// 1/2 RTT before this. Liveness RPCs including Raft messages are sent via
// SystemClass, and thus avoid head-of-line blocking by general RPC traffic.
livenessRenewalFraction = 0.5
// DefaultDescriptorLeaseDuration is the default mean duration a
// lease will be acquired for. The actual duration is jittered using
// the jitter fraction. Jittering is done to prevent multiple leases
// from being renewed simultaneously if they were all acquired
// simultaneously.
DefaultDescriptorLeaseDuration = 5 * time.Minute
// DefaultDescriptorLeaseJitterFraction is the default factor
// that we use to randomly jitter the lease duration when acquiring a
// new lease and the lease renewal timeout.
DefaultDescriptorLeaseJitterFraction = 0.05
// DefaultDescriptorLeaseRenewalTimeout is the default time
// before a lease expires when acquisition to renew the lease begins.
DefaultDescriptorLeaseRenewalTimeout = time.Minute
// DefaultLeaseRenewalCrossValidate is the default setting for whether
// descriptors should be validated on lease renewals.
DefaultLeaseRenewalCrossValidate = false
)
// DefaultCertsDirectory is the default value for the cert directory flag.
// It is computed once at package initialization by expanding ${HOME} from the
// process environment; os.ExpandEnv substitutes the empty string when HOME is
// unset, in which case the value is "/.cockroach-certs".
var DefaultCertsDirectory = os.ExpandEnv("${HOME}/.cockroach-certs")
// DefaultHistogramWindowInterval returns the default rotation window for
// histograms: six times DefaultMetricsSampleInterval.
func DefaultHistogramWindowInterval() time.Duration {
	const window = 6 * DefaultMetricsSampleInterval

	// Rudimentary overflow guard: the window is only used when it is at least
	// as long as the sample interval. An overflow could arise if
	// DefaultMetricsSampleInterval were set to an extremely large number
	// (e.g. in a test, or to intentionally disable metrics collection); in
	// that case fall back to the sample interval itself.
	if window >= DefaultMetricsSampleInterval {
		return window
	}
	return DefaultMetricsSampleInterval
}
var (
// NetworkTimeout is the timeout used for network operations that require a
// single network round trip. It is conservatively defined as one maximum
// network round trip time (RTT) plus one TCP packet retransmit (RTO) with an
// additional safety margin.
//
// The maximum RTT between cloud regions is roughly 400ms both in GCP
// (asia-south2 to southamerica-west1) and AWS (af-south-1 to sa-east-1), but
// p99 RTTs can occasionally approach 600ms.
// https://datastudio.google.com/reporting/fc733b10-9744-4a72-a502-92290f608571/page/70YCB
// https://www.cloudping.co/grid/p_99/timeframe/1W
//
// Linux has an RTT-dependent retransmission timeout (RTO) which we can
// approximate as 1.5x RTT (smoothed RTT + 4x RTT variance), with a lower
// bound of 200ms. It can thus be up to 900ms in the worst case.
//
// The NetworkTimeout is therefore set to 2 seconds: 600ms RTT plus 900ms RTO
// plus a 500ms safety margin.
NetworkTimeout = envutil.EnvOrDefaultDuration("COCKROACH_NETWORK_TIMEOUT", 2*time.Second)
// DialTimeout is the timeout used when dialing a node. gRPC connections take
// up to 3 roundtrips for the TCP + TLS handshakes. NetworkTimeout allows for
// both a network roundtrip and a TCP retransmit, but we don't need to
// tolerate more than 1 retransmit per connection attempt, so
// 2 * NetworkTimeout is sufficient.
DialTimeout = 2 * NetworkTimeout
// PingInterval is the interval between network heartbeat pings. It is used
// both for RPC heartbeat intervals and gRPC server keepalive pings. It is
// set to 1 second in order to fail fast, but with large default timeouts
// to tolerate high-latency multiregion clusters.
PingInterval = envutil.EnvOrDefaultDuration("COCKROACH_PING_INTERVAL", time.Second)
// defaultRangeLeaseDuration specifies the default range lease duration.
//
// Timers for Raft leadership election and lease expiration run in parallel.
// Although not required, we would like to elect a leader before the lease
// expires, such that we don't have to wait for a Raft election when we're
// ready to acquire the lease.
//
// The relevant operations and default time intervals are listed below. RTTs
// are assumed to range from 10ms to 400ms (see NetworkTimeout). Heartbeat
// offsets refer to the duration from the last heartbeat to the node crash
// (e.g. with a 1s heartbeat interval and 3s timeout, if a node crashes 1s
// after heartbeating then the timeout fires after 2s of unavailability and
// the offset is -1s).
//
// Raft election:
// - Heartbeat offset (0-1 heartbeat interval)    [-1.00s - 0.00s]
// - Election timeout (random 1x-2x timeout)      [ 2.00s - 4.00s]
// - Election (3x RTT: prevote, vote, append)     [ 0.03s - 1.20s]
// Total latency                                  [ 1.03s - 5.20s]
//
// Lease acquisition:
// - Heartbeat offset (0-1 heartbeat interval)    [-3.00s - 0.00s]
// - Lease expiration (constant)                  [ 6.00s - 6.00s]
// - Liveness epoch bump (2x RTT: CPut + append)  [ 0.02s - 0.80s]
// - Lease acquisition (1x RTT: append)           [ 0.01s - 0.40s]
// Total latency                                  [ 3.03s - 7.20s]
//
// (generated by TestDefaultRaftConfig)
//
// From the above, we note that the worst-case Raft election latency
// (4.03s-5.20s) is always less than the corresponding lease expiration +
// epoch bump time (6.02s-6.80s) regardless of RTT, such that the upper bound
// on unavailability is always given by the lease expiration time + 3x RTT
// (6.03s to 7.20s). With negligible RTT, the average latency is 4.5s for
// lease acquisition and 2.5s for Raft elections.
defaultRangeLeaseDuration = envutil.EnvOrDefaultDuration(
"COCKROACH_RANGE_LEASE_DURATION", 6*time.Second)
// defaultRPCHeartbeatTimeout is the default RPC heartbeat timeout. It is set
// very high at 3 * NetworkTimeout for several reasons: the gRPC transport may
// need to complete a dial/handshake before sending the heartbeat, the
// heartbeat has occasionally been seen to require 3 RTTs even post-dial (for
// unknown reasons), and under load the heartbeat may be head-of-line blocked
// by other RPC traffic.
//
// High-latency experiments with 6s RPC heartbeat timeouts showed that
// clusters running TPCC imports were stable at 400ms RTT, but started seeing
// RPC heartbeat failures at 500ms RTT. With light load (e.g. rate-limited
// kv50), clusters were stable at 1000ms RTT.
//
// The maximum p99 RPC heartbeat latency in any Cockroach Cloud cluster over a
// 90-day period was found to be 557ms. This was a single-region US cluster
// where the latency was caused by CPU overload.
//
// TODO(erikgrinaker): We should avoid head-of-line blocking for RPC
// heartbeats and reduce this to NetworkTimeout (plus DialTimeout for the
// initial heartbeat), see:
// https://github.com/cockroachdb/cockroach/issues/93397.
defaultRPCHeartbeatTimeout = 3 * NetworkTimeout
// defaultRaftTickInterval is the default resolution of the Raft timer.
defaultRaftTickInterval = envutil.EnvOrDefaultDuration(
"COCKROACH_RAFT_TICK_INTERVAL", 500*time.Millisecond)
// defaultRaftHeartbeatIntervalTicks is the default value for
// RaftHeartbeatIntervalTicks, which determines the number of ticks between
// each heartbeat.
defaultRaftHeartbeatIntervalTicks = envutil.EnvOrDefaultInt(
"COCKROACH_RAFT_HEARTBEAT_INTERVAL_TICKS", 2)
// defaultRaftElectionTimeoutTicks specifies the minimum number of Raft ticks
// before holding an election. The actual election timeout per replica is
// multiplied by a random factor of 1-2, to avoid ties.
//
// A timeout of 2 seconds with a Raft heartbeat sent every second gives each
// heartbeat 1 second to make it across the network. This is only half a
// network roundtrip, and as seen in NetworkTimeout 1 second is generally
// sufficient for a full network roundtrip. Raft heartbeats are also sent via
// SystemClass, avoiding head-of-line blocking by general RPC traffic. The 1-2
// random factor provides an additional buffer.
defaultRaftElectionTimeoutTicks = envutil.EnvOrDefaultInt(
"COCKROACH_RAFT_ELECTION_TIMEOUT_TICKS", 4)
// defaultRaftReproposalTimeoutTicks is the number of ticks before reproposing
// a Raft command.
defaultRaftReproposalTimeoutTicks = envutil.EnvOrDefaultInt(
"COCKROACH_RAFT_REPROPOSAL_TIMEOUT_TICKS", 6)
// defaultRaftLogTruncationThreshold specifies the upper bound that a single
// Range's Raft log can grow to before log truncations are triggered while at
// least one follower is missing. If all followers are active, the quota pool
// is responsible for ensuring the raft log doesn't grow without bound by
// making sure the leader doesn't get too far ahead.
defaultRaftLogTruncationThreshold = envutil.EnvOrDefaultInt64(
"COCKROACH_RAFT_LOG_TRUNCATION_THRESHOLD", 16<<20 /* 16 MB */)
// defaultRaftMaxSizePerMsg specifies the maximum aggregate byte size of Raft
// log entries that a leader will send to followers in a single MsgApp.
defaultRaftMaxSizePerMsg = envutil.EnvOrDefaultInt(
"COCKROACH_RAFT_MAX_SIZE_PER_MSG", 32<<10 /* 32 KB */)
// defaultRaftMaxCommittedSizePerReady specifies the maximum aggregate
// byte size of the committed log entries which a node will receive in a
// single Ready.
defaultRaftMaxCommittedSizePerReady = envutil.EnvOrDefaultInt(
"COCKROACH_RAFT_MAX_COMMITTED_SIZE_PER_READY", 64<<20 /* 64 MB */)
// defaultRaftMaxInflightMsgs specifies how many "inflight" MsgApps a leader
// will send to a given follower without hearing a response.
defaultRaftMaxInflightMsgs = envutil.EnvOrDefaultInt(
"COCKROACH_RAFT_MAX_INFLIGHT_MSGS", 128)
// defaultRaftMaxInflightBytes specifies the maximum aggregate byte size of
// Raft log entries that a leader will send to a follower without hearing
// responses.
//
// Previously it was assumed that RaftMaxInflightMsgs * RaftMaxSizePerMsg is a
// proxy for the actual max inflight traffic. However, RaftMaxSizePerMsg is
// not a hard limit, it's rather a "target" size for the message, so the
// actual inflight bytes could exceed this product by a large factor.
// RaftMaxInflightBytes is a more accurate limit, and should be used in
// conjunction with the two.
//
// TODO(#90314): lower this limit to something close to max rates observed in
// healthy clusters. Currently, this is a conservatively large multiple of
// defaultRaftMaxInflightMsgs * defaultRaftMaxSizePerMsg, so that we don't
// abruptly break the previous assumption and cut off traffic.
defaultRaftMaxInflightBytes = envutil.EnvOrDefaultBytes(
"COCKROACH_RAFT_MAX_INFLIGHT_BYTES", 256<<20 /* 256 MB */)
)
// Config is embedded by server.Config. A base config is not meant to be used
// directly, but embedding configs should call cfg.InitDefaults().
type Config struct {
// Insecure specifies whether to disable security checks throughout
// the code base.
// This is really not recommended.
// See: https://github.com/cockroachdb/cockroach/issues/53404
Insecure bool
// AcceptSQLWithoutTLS, when set, makes it possible for SQL
// clients to authenticate without TLS on a secure cluster.
//
// Authentication is, as usual, subject to the HBA configuration: in
// the default case, password authentication is still mandatory.
AcceptSQLWithoutTLS bool
// SSLCAKey is used to sign new certs.
SSLCAKey string
// SSLCertsDir is the path to the certificate/key directory.
SSLCertsDir string
// User running this process. It could be the user under which
// the server is running or the user passed in client calls.
User username.SQLUsername
// Addr is the address the server is listening on.
Addr string
// AdvertiseAddr is the address advertised by the server to other nodes
// in the cluster. It should be reachable by all other nodes and should
// route to an interface that Addr is listening on.
AdvertiseAddr string
// ClusterName is the name used as a sanity check when a node joins
// an uninitialized cluster, or when an uninitialized node joins an
// initialized cluster. The initial RPC handshake verifies that the
// name matches on both sides. Once the cluster ID has been
// negotiated on both sides, the cluster name is not used any more.
ClusterName string
// DisableClusterNameVerification, when set, alters the cluster name
// verification to only verify that a non-empty cluster name on
// both sides match. This is meant for use while rolling an
// existing cluster into using a new cluster name.
DisableClusterNameVerification bool
// SplitListenSQL indicates whether to listen for SQL
// clients on a separate address from RPC requests.
SplitListenSQL bool
// SQLAddr is the configured SQL listen address.
// This is used if SplitListenSQL is set to true.
SQLAddr string
// SQLAdvertiseAddr is the advertised SQL address.
// This is computed from SQLAddr if specified otherwise Addr.
SQLAdvertiseAddr string
// SocketFile, if non-empty, sets up a TLS-free local listener using
// a unix datagram socket at the specified path for SQL clients.
// This is auto-populated from SQLAddr if it initially ends with '.0'.
SocketFile string
// HTTPAddr is the configured HTTP listen address.
HTTPAddr string
// DisableTLSForHTTP, if set, disables TLS for the HTTP listener.
DisableTLSForHTTP bool
// HTTPAdvertiseAddr is the advertised HTTP address.
// This is computed from HTTPAddr if specified otherwise Addr.
HTTPAdvertiseAddr string
// RPCHeartbeatInterval controls how often Ping requests are sent on peer
// connections to determine connection health and update the local view of
// remote clocks.
RPCHeartbeatInterval time.Duration
// RPCHeartbeatTimeout is the timeout for Ping requests.
RPCHeartbeatTimeout time.Duration
// SecondaryTenantPortOffset is the increment to add to the various
// addresses to generate the network configuration for the in-memory
// secondary tenant. If set to zero (the default), ports are
// auto-allocated randomly.
// TODO(knz): Remove this mechanism altogether in favor of a single
// network listener with protocol routing.
// See: https://github.com/cockroachdb/cockroach/issues/84585
SecondaryTenantPortOffset int
// ClockDevicePath enables the use of a PTP hardware clock user space API
// for HLC current time. It contains the path to the device to be used
// (e.g. /dev/ptp0).
ClockDevicePath string
// AutoInitializeCluster, if set, causes the server to bootstrap the
// cluster. Note that if two nodes are started with this flag set
// and also configured to join each other, each node will bootstrap
// its own unique cluster and the join will fail.
//
// The flag exists mostly for the benefit of tests, and for
// `cockroach start-single-node`.
AutoInitializeCluster bool
// LocalityAddresses contains private IP addresses that can only be accessed
// in the corresponding locality.
LocalityAddresses []roachpb.LocalityAddress
}
// HistogramWindowInterval reports approximately how long individual samples
// are retained in in-memory histograms. It currently delegates to
// DefaultHistogramWindowInterval, i.e. an arbitrary six times the metrics
// sample interval.
//
// The window has to be longer than the sampling interval because of issue
// #12998, where histograms returned zero values when sampled since every
// sample had already been evicted.
//
// This is only meant as a temporary fix for that issue: the handling of
// metric histograms has numerous additional problems, which are tracked
// (together with the proposed fixes) in github issue #7896.
func (*Config) HistogramWindowInterval() time.Duration {
	window := DefaultHistogramWindowInterval()
	return window
}
// InitDefaults resets the fields of cfg listed below to their default values.
// Besides initializing a fresh config, it is also used in tests to reset
// global objects.
func (cfg *Config) InitDefaults() {
	// Security.
	cfg.Insecure = defaultInsecure
	cfg.AcceptSQLWithoutTLS = false
	cfg.SSLCertsDir = DefaultCertsDirectory
	cfg.User = username.MakeSQLUsernameFromPreNormalizedString(defaultUser)

	// RPC listener; the advertised address mirrors the listen address.
	cfg.Addr, cfg.AdvertiseAddr = defaultAddr, defaultAddr

	// SQL listener; the advertised address mirrors the listen address.
	cfg.SplitListenSQL = false
	cfg.SQLAddr, cfg.SQLAdvertiseAddr = defaultSQLAddr, defaultSQLAddr
	cfg.SocketFile = ""

	// HTTP listener.
	cfg.HTTPAddr = defaultHTTPAddr
	cfg.HTTPAdvertiseAddr = ""
	cfg.DisableTLSForHTTP = false

	// RPC heartbeats.
	cfg.RPCHeartbeatInterval = PingInterval
	cfg.RPCHeartbeatTimeout = defaultRPCHeartbeatTimeout

	// Cluster identity and miscellaneous settings.
	cfg.ClusterName = ""
	cfg.DisableClusterNameVerification = false
	cfg.ClockDevicePath = ""
	cfg.SecondaryTenantPortOffset = 0
}
// HTTPRequestScheme returns the URL scheme to use for HTTP requests: "https"
// only when the config is secure and TLS has not been disabled for the HTTP
// listener, "http" otherwise.
func (cfg *Config) HTTPRequestScheme() string {
	useTLS := !cfg.Insecure && !cfg.DisableTLSForHTTP
	if useTLS {
		return httpsScheme
	}
	return httpScheme
}
// AdminURL returns the URL for the admin UI, built from the scheme reported
// by HTTPRequestScheme and the advertised HTTP address.
func (cfg *Config) AdminURL() *url.URL {
	adminURL := new(url.URL)
	adminURL.Scheme = cfg.HTTPRequestScheme()
	adminURL.Host = cfg.HTTPAdvertiseAddr
	return adminURL
}
// RaftConfig holds raft tuning parameters. Fields left at their zero value
// are populated by SetDefaults.
type RaftConfig struct {
// RaftTickInterval is the resolution of the Raft timer.
RaftTickInterval time.Duration
// RaftElectionTimeoutTicks is the minimum number of raft ticks before holding
// an election. The actual election timeout is randomized by each replica to
// between 1-2 election timeouts. This value is inherited by individual stores
// unless overridden.
RaftElectionTimeoutTicks int
// RaftReproposalTimeoutTicks is the number of ticks before reproposing a Raft
// command. This also specifies the number of ticks between each reproposal
// check, so the actual timeout is 1-2 times this value.
RaftReproposalTimeoutTicks int
// RaftHeartbeatIntervalTicks is the number of ticks that pass between heartbeats.
RaftHeartbeatIntervalTicks int
// RangeLeaseDuration specifies the range lease duration.
RangeLeaseDuration time.Duration
// RangeLeaseRenewalFraction specifies what fraction the range lease renewal
// duration should be of the range lease active time. For example, with a
// value of 0.2 and a lease duration of 10 seconds, leases would be eagerly
// renewed 8 seconds into each lease. A value of zero means use the default
// and a value of -1 means never preemptively renew the lease. A value of 1
// means always renew.
RangeLeaseRenewalFraction float64
// RaftLogTruncationThreshold controls how large a single Range's Raft log
// can grow. When a Range's Raft log grows above this size, the Range will
// begin performing log truncations.
RaftLogTruncationThreshold int64
// RaftProposalQuota controls the maximum aggregate size of Raft commands
// that a leader is allowed to propose concurrently.
//
// By default, the quota is set to a fraction of the Raft log truncation
// threshold. In doing so, we ensure all replicas have sufficiently up to
// date logs so that when the log gets truncated, the followers do not need
// non-preemptive snapshots. Changing this deserves care. Too low and
// everything comes to a grinding halt, too high and we're not really
// throttling anything (we'll still generate snapshots).
RaftProposalQuota int64
// RaftMaxUncommittedEntriesSize controls how large the uncommitted tail of
// the Raft log can grow. The limit is meant to provide protection against
// unbounded Raft log growth when quorum is lost and entries stop being
// committed but continue to be proposed.
RaftMaxUncommittedEntriesSize uint64
// RaftMaxSizePerMsg controls the maximum aggregate byte size of Raft log
// entries the leader will send to followers in a single MsgApp. Smaller
// value lowers the raft recovery cost (during initial probing and after
// message loss during normal operation). On the other hand, it limits the
// throughput during normal replication.
//
// Used in combination with RaftMaxInflightMsgs and RaftMaxInflightBytes.
RaftMaxSizePerMsg uint64
// RaftMaxCommittedSizePerReady controls the maximum aggregate byte size of
// committed Raft log entries a replica will receive in a single Ready.
RaftMaxCommittedSizePerReady uint64
// RaftMaxInflightMsgs controls how many "inflight" MsgApps Raft will send to
// a follower without hearing a response. The total size of inflight Raft log
// entries is thus roughly limited by RaftMaxInflightMsgs * RaftMaxSizePerMsg,
// but also by RaftMaxInflightBytes. The current default settings provide for
// up to 4 MB of Raft log to be sent without acknowledgement. With an average
// entry size of 1 KB that translates to ~4096 commands that might be executed
// in the handling of a single raft.Ready operation.
//
// This setting is used both by sending and receiving end of Raft messages. To
// minimize dropped messages on the receiver, its size should at least match
// the sender's (be it the default size, or taken from the env variables).
RaftMaxInflightMsgs int
// RaftMaxInflightBytes controls the maximum aggregate byte size of Raft log
// entries that a leader will send to a follower without hearing responses.
//
// Normally RaftMaxSizePerMsg * RaftMaxInflightMsgs is the actual limit. But
// the RaftMaxSizePerMsg is soft, and Raft may send individual messages
// arbitrarily larger than that (e.g. with a large write, or AddSST command),
// so it's possible that the overall limit is exceeded by a large multiple.
// RaftMaxInflightBytes is a stricter limit which can only be slightly
// exceeded (by a single message).
//
// This effectively bounds the bandwidth-delay product. Note that especially
// in high-latency deployments setting this too low can lead to a dramatic
// reduction in throughput. For example, with a peer that has a round-trip
// latency of 100ms to the leader and this setting is set to 1 MB, there is a
// throughput limit of 10 MB/s for this group. With RTT of 400ms, this drops
// to 2.5 MB/s. See Little's law to understand the maths behind.
RaftMaxInflightBytes uint64
// RaftDelaySplitToSuppressSnapshot is the delay applied to splits of a
// range that has a replica needing a snapshot.
//
// Splitting a range which has a replica needing a snapshot results in two
// ranges in that state. The delay configured here slows down splits when in
// that situation (limiting to those splits not run through the split
// queue). The most important target here are the splits performed by
// backup/restore.
//
// -1 to disable.
RaftDelaySplitToSuppressSnapshot time.Duration
}
// SetDefaults initializes unset fields: every field that still holds its zero
// value is replaced by its default, while fields that were set explicitly are
// left alone. It panics if the resulting proposal quota exceeds the maximum
// uncommitted entries size.
func (cfg *RaftConfig) SetDefaults() {
	// Helpers that overwrite a field only while it is still zero.
	orInt := func(dst *int, fallback int) {
		if *dst == 0 {
			*dst = fallback
		}
	}
	orUint64 := func(dst *uint64, fallback uint64) {
		if *dst == 0 {
			*dst = fallback
		}
	}
	orDuration := func(dst *time.Duration, fallback time.Duration) {
		if *dst == 0 {
			*dst = fallback
		}
	}

	// Timer resolution and tick-based timeouts.
	orDuration(&cfg.RaftTickInterval, defaultRaftTickInterval)
	orInt(&cfg.RaftElectionTimeoutTicks, defaultRaftElectionTimeoutTicks)
	orInt(&cfg.RaftHeartbeatIntervalTicks, defaultRaftHeartbeatIntervalTicks)
	orInt(&cfg.RaftReproposalTimeoutTicks, defaultRaftReproposalTimeoutTicks)

	// Range leases.
	orDuration(&cfg.RangeLeaseDuration, defaultRangeLeaseDuration)
	// TODO(andrei): -1 is a special value for RangeLeaseRenewalFraction which
	// really means "0" (never renew), except that the zero value means "use
	// default". We can't turn the -1 into 0 here because, unfortunately,
	// SetDefaults is called multiple times (see NewStore()). So, we leave -1
	// alone and ask all the users to handle it specially.
	if cfg.RangeLeaseRenewalFraction == 0 {
		cfg.RangeLeaseRenewalFraction = defaultRangeLeaseRenewalFraction
	}

	// Raft log sizing. Each of the next three values is derived from the
	// previous one, so they must be resolved in this order.
	if cfg.RaftLogTruncationThreshold == 0 {
		cfg.RaftLogTruncationThreshold = defaultRaftLogTruncationThreshold
	}
	if cfg.RaftProposalQuota == 0 {
		// Half of the log truncation threshold. See the comment on the field
		// for the tradeoffs of setting this higher or lower.
		cfg.RaftProposalQuota = cfg.RaftLogTruncationThreshold / 2
	}
	// Twice the proposal quota. The logic here is that the quotaPool should
	// be responsible for throttling proposals in all cases except for
	// unbounded Raft re-proposals because it queues efficiently instead of
	// dropping proposals on the floor indiscriminately.
	orUint64(&cfg.RaftMaxUncommittedEntriesSize, uint64(2*cfg.RaftProposalQuota))

	// Message and inflight limits.
	orUint64(&cfg.RaftMaxSizePerMsg, uint64(defaultRaftMaxSizePerMsg))
	orUint64(&cfg.RaftMaxCommittedSizePerReady, uint64(defaultRaftMaxCommittedSizePerReady))
	orInt(&cfg.RaftMaxInflightMsgs, defaultRaftMaxInflightMsgs)
	orUint64(&cfg.RaftMaxInflightBytes, uint64(defaultRaftMaxInflightBytes))

	// Raise RaftMaxInflightBytes if it is lower than the value computed from
	// the message count and per-message size.
	floor := maxInflightBytesFrom(cfg.RaftMaxInflightMsgs, cfg.RaftMaxSizePerMsg)
	if cfg.RaftMaxInflightBytes < floor {
		cfg.RaftMaxInflightBytes = floor
	}

	// Use a generous delay to make sure even a backed up Raft snapshot queue is
	// going to make progress when a (not overly concurrent) amount of splits
	// happens. The generous amount should result in a delay sufficient to
	// transmit at least one snapshot with the slow delay, which with default
	// settings is max 512MB at 32MB/s, ie 16 seconds.
	orDuration(&cfg.RaftDelaySplitToSuppressSnapshot, 45*time.Second)

	// Minor validation to ensure sane tuning.
	if cfg.RaftProposalQuota > int64(cfg.RaftMaxUncommittedEntriesSize) {
		panic("raft proposal quota should not be above max uncommitted entries size")
	}
}
// RaftElectionTimeout returns the raft election timeout: the number of
// election timeout ticks multiplied by the tick interval.
func (cfg RaftConfig) RaftElectionTimeout() time.Duration {
	ticks := time.Duration(cfg.RaftElectionTimeoutTicks)
	return ticks * cfg.RaftTickInterval
}
// RangeLeaseDurations returns the range lease expiration duration followed by
// the range lease renewal duration.
func (cfg RaftConfig) RangeLeaseDurations() (time.Duration, time.Duration) {
	expiration := cfg.RangeLeaseDuration
	renewal := cfg.RangeLeaseRenewalDuration()
	return expiration, renewal
}
// RangeLeaseRenewalDuration specifies a time interval at the end of the
// active lease interval (i.e. bounded to the right by the start of the stasis
// period) during which operations will trigger an asynchronous renewal of the
// lease. It is RangeLeaseRenewalFraction of RangeLeaseDuration, except that a
// fraction of -1 yields zero.
func (cfg RaftConfig) RangeLeaseRenewalDuration() time.Duration {
	fraction := cfg.RangeLeaseRenewalFraction
	if fraction == -1 {
		// -1 is the sentinel for "never preemptively renew".
		return 0
	}
	leaseDuration := float64(cfg.RangeLeaseDuration)
	return time.Duration(fraction * leaseDuration)
}
// RangeLeaseAcquireTimeout is the timeout for lease acquisition. It is twice
// the Raft election timeout.
func (cfg RaftConfig) RangeLeaseAcquireTimeout() time.Duration {
	// Each replica randomizes its Raft election timeout by a factor of 1-2x
	// (the first to fire triggers the election), and reproposing the lease
	// acquisition command can take up to one more election timeout. Assuming
	// negligible RTT, electing a leader and acquiring a lease should therefore
	// fit within two election timeouts on average. When it does not, lease
	// acquisition is typically retried, which only adds a bit of tail latency.
	const electionTimeouts = 2
	return electionTimeouts * cfg.RaftElectionTimeout()
}
// NodeLivenessDurations computes durations for node liveness expiration and
// renewal. The active (expiration) duration equals RangeLeaseDuration, and the
// renewal duration is that value scaled by livenessRenewalFraction.
func (cfg RaftConfig) NodeLivenessDurations() (livenessActive, livenessRenewal time.Duration) {
	active := cfg.RangeLeaseDuration
	renewal := time.Duration(float64(active) * livenessRenewalFraction)
	return active, renewal
}
// SentinelGossipTTL is the time-to-live for the gossip sentinel, which is set
// to RangeLeaseDuration.
//
// The sentinel tells a node whether it is connected to the primary gossip
// network rather than just a partition of it. Because a connected gossip
// network is necessary to propagate liveness, the sentinel must expire fairly
// quickly and be continually re-gossiped. It is gossiped by the replica that
// holds the lease for the first range.
func (cfg RaftConfig) SentinelGossipTTL() time.Duration {
	ttl := cfg.RangeLeaseDuration
	return ttl
}
// DefaultRetryOptions returns the retry options that should be used for
// retrying most network-dependent operations: backoff starts at 50ms and is
// multiplied by 2, with a maximum backoff of 1s.
func DefaultRetryOptions() retry.Options {
	// TODO(bdarnell): This should vary with network latency.
	// Derive the retry options from a configured or measured
	// estimate of latency.
	var opts retry.Options
	opts.InitialBackoff = 50 * time.Millisecond
	opts.MaxBackoff = 1 * time.Second
	opts.Multiplier = 2
	return opts
}
// maxInflightBytesFrom returns the minimal value for the RaftMaxInflightBytes
// config option implied by RaftMaxInflightMsgs and RaftMaxSizePerMsg: their
// product, saturated at math.MaxUint64.
func maxInflightBytesFrom(maxInflightMsgs int, maxSizePerMsg uint64) uint64 {
	// Multiply with arbitrary precision so the product cannot silently wrap.
	msgs := big.NewInt(int64(maxInflightMsgs))
	size := new(big.Int).SetUint64(maxSizePerMsg)
	product := msgs.Mul(msgs, size)
	if !product.IsUint64() {
		// Any product that is not representable as a uint64 is reported as the
		// maximum uint64 value.
		return math.MaxUint64
	}
	return product.Uint64()
}
// StorageConfig contains the storage configuration shared by all storage
// engines.
type StorageConfig struct {
// Attrs holds the roachpb.Attributes for this storage instance.
// NOTE(review): presumably the store attributes; confirm against callers.
Attrs roachpb.Attributes
// Dir is the data directory for the Pebble instance.
Dir string
// MustExist, if true, makes creating the instance fail if the target
// directory does not hold an initialized instance.
//
// Makes no sense for in-memory instances.
MustExist bool
// MaxSize is used for calculating free space and making rebalancing
// decisions. Zero indicates that there is no maximum size.
MaxSize int64
// BallastSize is the amount reserved by a ballast file for manual
// out-of-disk recovery.
BallastSize int64
// Settings is the instance for cluster-wide knobs. Must not be nil.
Settings *cluster.Settings
// UseFileRegistry is true if the file registry is needed (e.g.
// encryption-at-rest). This may force the store version to
// versionFileRegistry if currently lower.
UseFileRegistry bool
// EncryptionOptions is a serialized protobuf set by Go CCL code and passed
// through to C CCL code to set up encryption-at-rest. Must be set if and
// only if encryption is enabled, otherwise left empty.
EncryptionOptions []byte
}
// IsEncrypted reports whether the StorageConfig has encryption enabled, which
// is the case exactly when EncryptionOptions is non-empty.
func (sc StorageConfig) IsEncrypted() bool {
	return len(sc.EncryptionOptions) != 0
}
const (
	// DefaultTempStorageMaxSizeBytes is the default maximum budget
	// for temp storage (32 GiB).
	DefaultTempStorageMaxSizeBytes = 32 << 30
	// DefaultInMemTempStorageMaxSizeBytes is the default maximum budget
	// for in-memory temp storages (100 MiB).
	DefaultInMemTempStorageMaxSizeBytes = 100 << 20
)
// TempStorageConfig contains the details that can be specified in the cli
// pertaining to temp storage flags, specifically --temp-dir and
// --max-disk-temp-storage.
type TempStorageConfig struct {
// InMemory specifies whether the temporary storage will remain
// in-memory or occupy a temporary subdirectory on-disk.
InMemory bool
// Path is the filepath of the temporary subdirectory created for
// the temp storage.
Path string
// Mon will be used by the temp storage to register all its capacity requests.
// It can be used to limit the disk or memory that temp storage is allowed to
// use. If InMemory is set, then this has to be a memory monitor; otherwise it
// has to be a disk monitor.
Mon *mon.BytesMonitor
// Spec stores the StoreSpec this TempStorageConfig will use.
Spec StoreSpec
// Settings stores the cluster.Settings this TempStorageConfig will use. Must
// not be nil.
Settings *cluster.Settings
}
// ExternalIODirConfig describes various configuration options pertaining
// to external storage implementations.
// TODO(adityamaru): Rename ExternalIODirConfig to ExternalIOConfig because it
// is now used to configure both ExternalStorage and KMS.
type ExternalIODirConfig struct {
// DisableHTTP disables the use of external HTTP endpoints.
// This turns off http:// external storage as well as any custom
// endpoints for cloud storage implementations.
DisableHTTP bool
// DisableImplicitCredentials disables the use of implicit credentials when
// accessing external services. Implicit credentials are obtained from the
// system environment. Setting this turns off implicit credentials and
// requires the user to provide the necessary access keys.
DisableImplicitCredentials bool
// DisableOutbound disables the use of any external-io that dials out such as
// to s3, gcs, or even `nodelocal` as it may need to dial another node.
DisableOutbound bool
// EnableNonAdminImplicitAndArbitraryOutbound removes the usual restriction to
// only admin users placed on usage of node-granted access, such as to the
// implicit auth via the machine account for the node or to arbitrary network
// addresses (which are accessed from the node and might otherwise not be
// reachable). Instead, all users can use implicit auth, http addresses and
// configure custom endpoints. This should only be used if all users with SQL
// access should have access to anything the node has access to.
EnableNonAdminImplicitAndArbitraryOutbound bool
}
// TempStorageConfigFromEnv creates a TempStorageConfig for the given store.
//
// The temp storage is in-memory only if parentDir is empty and useStore is
// itself in-memory. A byte monitor with a standalone budget of maxSizeBytes is
// created and started, and is returned in the config's Mon field. The Path
// field of the returned config is left unset.
func TempStorageConfigFromEnv(
	ctx context.Context,
	st *cluster.Settings,
	useStore StoreSpec,
	parentDir string,
	maxSizeBytes int64,
) TempStorageConfig {
	inMem := useStore.InMemory && parentDir == ""

	// Name the monitor after the kind of temp storage it accounts for.
	monitorName := redact.RedactableString("temp disk storage")
	if inMem {
		monitorName = "in-mem temp storage"
	}

	const increment = 1024 * 1024
	noteworthy := maxSizeBytes / 10
	monitor := mon.NewMonitor(
		monitorName,
		mon.DiskResource,
		nil, /* curCount */
		nil, /* maxHist */
		increment,
		noteworthy,
		st,
	)
	budget := mon.NewStandaloneBudget(maxSizeBytes)
	monitor.Start(ctx, nil /* pool */, budget)

	var cfg TempStorageConfig
	cfg.InMemory = inMem
	cfg.Mon = monitor
	cfg.Spec = useStore
	cfg.Settings = st
	return cfg
}