diff --git a/config/tablet/default.yaml b/config/tablet/default.yaml index 427465d4598..d0553527f5a 100644 --- a/config/tablet/default.yaml +++ b/config/tablet/default.yaml @@ -118,6 +118,7 @@ cacheResultFields: true # enable-query-plan-field-caching # enable-tx-throttler # tx-throttler-config # tx-throttler-healthcheck-cells +# tx-throttler-tablet-types # enable_transaction_limit # enable_transaction_limit_dry_run # transaction_limit_per_user diff --git a/doc/design-docs/ReplicationLagBasedThrottlingOfTransactions.md b/doc/design-docs/ReplicationLagBasedThrottlingOfTransactions.md index ad1d98b151f..68686d4f72f 100644 --- a/doc/design-docs/ReplicationLagBasedThrottlingOfTransactions.md +++ b/doc/design-docs/ReplicationLagBasedThrottlingOfTransactions.md @@ -30,7 +30,13 @@ If this is not specified a [default](https://github.com/vitessio/vitess/tree/mai * *tx-throttler-healthcheck-cells* A comma separated list of datacenter cells. The throttler will only monitor -the non-RDONLY replicas found in these cells for replication lag. +the replicas found in these cells for replication lag. + +* *tx-throttler-tablet-types* + +A comma separated list of tablet types. The throttler will only monitor tablets +with these types. Only `replica` and/or `rdonly` types are supported. The default +is `replica`. # Caveats and Known Issues * The throttler keeps trying to explore the maximum rate possible while keeping @@ -39,4 +45,3 @@ lag limit may occasionally be slightly violated. * Transactions are considered homogeneous. There is currently no support for specifying how `expensive` a transaction is. 
- diff --git a/doc/design-docs/TabletServerParamsAsYAML.md b/doc/design-docs/TabletServerParamsAsYAML.md index 25543ad9018..5e2d02c90d7 100644 --- a/doc/design-docs/TabletServerParamsAsYAML.md +++ b/doc/design-docs/TabletServerParamsAsYAML.md @@ -146,6 +146,7 @@ sanitizeLogMessages: false # sanitize_log_messages # enable-tx-throttler # tx-throttler-config # tx-throttler-healthcheck-cells +# tx-throttler-tablet-types # enable_transaction_limit # enable_transaction_limit_dry_run # transaction_limit_per_user diff --git a/go/flags/endtoend/vttablet.txt b/go/flags/endtoend/vttablet.txt index 6f6269e8fee..fc12f48f9ed 100644 --- a/go/flags/endtoend/vttablet.txt +++ b/go/flags/endtoend/vttablet.txt @@ -347,7 +347,8 @@ Usage of vttablet: --tx-throttler-config string Synonym to -tx_throttler_config (default "target_replication_lag_sec: 2\nmax_replication_lag_sec: 10\ninitial_rate: 100\nmax_increase: 1\nemergency_decrease: 0.5\nmin_duration_between_increases_sec: 40\nmax_duration_between_increases_sec: 62\nmin_duration_between_decreases_sec: 20\nspread_backlog_across_sec: 20\nage_bad_rate_after_sec: 180\nbad_rate_increase: 0.1\nmax_rate_approach_threshold: 0.9\n") --tx-throttler-default-priority int Default priority assigned to queries that lack priority information (default 100) --tx-throttler-healthcheck-cells strings Synonym to -tx_throttler_healthcheck_cells - --tx_throttler_config string The configuration of the transaction throttler as a text formatted throttlerdata.Configuration protocol buffer message (default "target_replication_lag_sec: 2\nmax_replication_lag_sec: 10\ninitial_rate: 100\nmax_increase: 1\nemergency_decrease: 0.5\nmin_duration_between_increases_sec: 40\nmax_duration_between_increases_sec: 62\nmin_duration_between_decreases_sec: 20\nspread_backlog_across_sec: 20\nage_bad_rate_after_sec: 180\nbad_rate_increase: 0.1\nmax_rate_approach_threshold: 0.9\n") + --tx-throttler-tablet-types strings A comma-separated list of tablet types. 
Only tablets of this type are monitored for replication lag by the transaction throttler. Supported types are replica and/or rdonly. (default replica) + --tx_throttler_config string The configuration of the transaction throttler as a text-formatted throttlerdata.Configuration protocol buffer message. (default "target_replication_lag_sec: 2\nmax_replication_lag_sec: 10\ninitial_rate: 100\nmax_increase: 1\nemergency_decrease: 0.5\nmin_duration_between_increases_sec: 40\nmax_duration_between_increases_sec: 62\nmin_duration_between_decreases_sec: 20\nspread_backlog_across_sec: 20\nage_bad_rate_after_sec: 180\nbad_rate_increase: 0.1\nmax_rate_approach_threshold: 0.9\n") --tx_throttler_healthcheck_cells strings A comma-separated list of cells. Only tabletservers running in these cells will be monitored for replication lag by the transaction throttler. --unhealthy_threshold duration replication lag after which a replica is considered unhealthy (default 2h0m0s) --v Level log level for V logs diff --git a/go/vt/vttablet/tabletserver/tabletenv/config.go b/go/vt/vttablet/tabletserver/tabletenv/config.go index 96a4375bd47..046149cdcc6 100644 --- a/go/vt/vttablet/tabletserver/tabletenv/config.go +++ b/go/vt/vttablet/tabletserver/tabletenv/config.go @@ -32,9 +32,13 @@ import ( "vitess.io/vitess/go/vt/dbconfigs" "vitess.io/vitess/go/vt/log" querypb "vitess.io/vitess/go/vt/proto/query" + topodatapb "vitess.io/vitess/go/vt/proto/topodata" + vtrpcpb "vitess.io/vitess/go/vt/proto/vtrpc" "vitess.io/vitess/go/vt/servenv" "vitess.io/vitess/go/vt/sqlparser" "vitess.io/vitess/go/vt/throttler" + "vitess.io/vitess/go/vt/topo/topoproto" + "vitess.io/vitess/go/vt/vterrors" ) // These constants represent values for various config parameters. @@ -153,9 +157,10 @@ func registerTabletEnvFlags(fs *pflag.FlagSet) { SecondsVar(fs, &currentConfig.TwoPCAbandonAge, "twopc_abandon_age", defaultConfig.TwoPCAbandonAge, "time in seconds. 
Any unresolved transaction older than this time will be sent to the coordinator to be resolved.") // Tx throttler config flagutil.DualFormatBoolVar(fs, &currentConfig.EnableTxThrottler, "enable_tx_throttler", defaultConfig.EnableTxThrottler, "If true replication-lag-based throttling on transactions will be enabled.") - flagutil.DualFormatStringVar(fs, &currentConfig.TxThrottlerConfig, "tx_throttler_config", defaultConfig.TxThrottlerConfig, "The configuration of the transaction throttler as a text formatted throttlerdata.Configuration protocol buffer message") + flagutil.DualFormatStringVar(fs, &currentConfig.TxThrottlerConfig, "tx_throttler_config", defaultConfig.TxThrottlerConfig, "The configuration of the transaction throttler as a text-formatted throttlerdata.Configuration protocol buffer message.") flagutil.DualFormatStringListVar(fs, &currentConfig.TxThrottlerHealthCheckCells, "tx_throttler_healthcheck_cells", defaultConfig.TxThrottlerHealthCheckCells, "A comma-separated list of cells. Only tabletservers running in these cells will be monitored for replication lag by the transaction throttler.") fs.IntVar(&currentConfig.TxThrottlerDefaultPriority, "tx-throttler-default-priority", defaultConfig.TxThrottlerDefaultPriority, "Default priority assigned to queries that lack priority information") + fs.Var(currentConfig.TxThrottlerTabletTypes, "tx-throttler-tablet-types", "A comma-separated list of tablet types. Only tablets of this type are monitored for replication lag by the transaction throttler. 
Supported types are replica and/or rdonly.") fs.BoolVar(&enableHotRowProtection, "enable_hot_row_protection", false, "If true, incoming transactions for the same row (range) will be queued and cannot consume all txpool slots.") fs.BoolVar(&enableHotRowProtectionDryRun, "enable_hot_row_protection_dry_run", false, "If true, hot row protection is not enforced but logs if transactions would have been queued.") @@ -332,10 +337,11 @@ type TabletConfig struct { TwoPCCoordinatorAddress string `json:"-"` TwoPCAbandonAge Seconds `json:"-"` - EnableTxThrottler bool `json:"-"` - TxThrottlerConfig string `json:"-"` - TxThrottlerHealthCheckCells []string `json:"-"` - TxThrottlerDefaultPriority int `json:"-"` + EnableTxThrottler bool `json:"-"` + TxThrottlerConfig string `json:"-"` + TxThrottlerHealthCheckCells []string `json:"-"` + TxThrottlerDefaultPriority int `json:"-"` + TxThrottlerTabletTypes *topoproto.TabletTypeListFlag `json:"-"` EnableLagThrottler bool `json:"-"` EnableTableGC bool `json:"-"` // can be turned off programmatically by tests @@ -634,6 +640,9 @@ func (c *TabletConfig) Verify() error { if err := c.verifyTransactionLimitConfig(); err != nil { return err } + if err := c.verifyTxThrottlerConfig(); err != nil { + return err + } if v := c.HotRowProtection.MaxQueueSize; v <= 0 { return fmt.Errorf("-hot_row_protection_max_queue_size must be > 0 (specified value: %v)", v) } @@ -682,6 +691,28 @@ func (c *TabletConfig) verifyTransactionLimitConfig() error { return nil } +// verifyTxThrottlerConfig checks the TxThrottler related config for sanity. +// The checks are skipped when the transaction throttler is disabled. Otherwise it +// returns FAILED_PRECONDITION if no tablet types are configured, and INVALID_ARGUMENT +// if any configured type is something other than REPLICA or RDONLY. 
+func (c *TabletConfig) verifyTxThrottlerConfig() error { + if !c.EnableTxThrottler { + return nil + } + if c.TxThrottlerTabletTypes == nil || len(*c.TxThrottlerTabletTypes) == 0 { + return vterrors.New(vtrpcpb.Code_FAILED_PRECONDITION, "--tx-throttler-tablet-types must be defined when transaction throttler is enabled") + } + for _, tabletType := range *c.TxThrottlerTabletTypes { + switch tabletType { + case topodatapb.TabletType_REPLICA, topodatapb.TabletType_RDONLY: + continue + default: + return vterrors.Errorf(vtrpcpb.Code_INVALID_ARGUMENT, "unsupported tablet type %q", tabletType) + } + } + return nil +} + // Some of these values are for documentation purposes. // They actually get overwritten during Init. var defaultConfig = TabletConfig{ @@ -767,6 +798,7 @@ var defaultConfig = TabletConfig{ TxThrottlerConfig: defaultTxThrottlerConfig(), TxThrottlerHealthCheckCells: []string{}, TxThrottlerDefaultPriority: sqlparser.MaxPriorityValue, // This leads to all queries being candidates to throttle + TxThrottlerTabletTypes: &topoproto.TabletTypeListFlag{topodatapb.TabletType_REPLICA}, EnableLagThrottler: false, // Feature flag; to switch to 'true' at some stage in the future diff --git a/go/vt/vttablet/tabletserver/tabletenv/config_test.go b/go/vt/vttablet/tabletserver/tabletenv/config_test.go index c2bb12f4874..0b1bd707de0 100644 --- a/go/vt/vttablet/tabletserver/tabletenv/config_test.go +++ b/go/vt/vttablet/tabletserver/tabletenv/config_test.go @@ -26,6 +26,10 @@ import ( "vitess.io/vitess/go/test/utils" "vitess.io/vitess/go/vt/dbconfigs" + topodatapb "vitess.io/vitess/go/vt/proto/topodata" + vtrpcpb "vitess.io/vitess/go/vt/proto/vtrpc" + "vitess.io/vitess/go/vt/topo/topoproto" + "vitess.io/vitess/go/vt/vterrors" "vitess.io/vitess/go/yaml2" ) @@ -327,3 +331,40 @@ func TestFlags(t *testing.T) { want.SanitizeLogMessages = true assert.Equal(t, want, currentConfig) } + +func TestVerifyTxThrottlerConfig(t *testing.T) { + // Work on a private copy so the package-level currentConfig is left untouched. + config := NewDefaultConfig() + config.EnableTxThrottler = true + { + // default config (replica) + assert.Nil(t, config.verifyTxThrottlerConfig()) + } + 
{ + // replica + rdonly (allowed) + config.TxThrottlerTabletTypes = &topoproto.TabletTypeListFlag{ + topodatapb.TabletType_REPLICA, + topodatapb.TabletType_RDONLY, + } + assert.Nil(t, config.verifyTxThrottlerConfig()) + } + { + // no tablet types + config.TxThrottlerTabletTypes = &topoproto.TabletTypeListFlag{} + err := config.verifyTxThrottlerConfig() + assert.NotNil(t, err) + assert.Equal(t, vtrpcpb.Code_FAILED_PRECONDITION, vterrors.Code(err)) + } + { + // disallowed tablet type + config.TxThrottlerTabletTypes = &topoproto.TabletTypeListFlag{topodatapb.TabletType_DRAINED} + err := config.verifyTxThrottlerConfig() + assert.NotNil(t, err) + assert.Equal(t, vtrpcpb.Code_INVALID_ARGUMENT, vterrors.Code(err)) + } + { + // throttler disabled: the unsupported type set above is no longer checked + config.EnableTxThrottler = false + assert.Nil(t, config.verifyTxThrottlerConfig()) + } +} diff --git a/go/vt/vttablet/tabletserver/txthrottler/tx_throttler.go b/go/vt/vttablet/tabletserver/txthrottler/tx_throttler.go index b65dbfc20df..d0b9013499d 100644 --- a/go/vt/vttablet/tabletserver/txthrottler/tx_throttler.go +++ b/go/vt/vttablet/tabletserver/txthrottler/tx_throttler.go @@ -33,11 +33,11 @@ import ( "vitess.io/vitess/go/vt/sqlparser" "vitess.io/vitess/go/vt/throttler" "vitess.io/vitess/go/vt/topo" + "vitess.io/vitess/go/vt/topo/topoproto" "vitess.io/vitess/go/vt/vttablet/tabletserver/tabletenv" querypb "vitess.io/vitess/go/vt/proto/query" throttlerdatapb "vitess.io/vitess/go/vt/proto/throttlerdata" - topodatapb "vitess.io/vitess/go/vt/proto/topodata" ) // These vars store the functions used to create the topo server, healthcheck, @@ -181,6 +181,7 @@ func tryCreateTxThrottler(env tabletenv.Env, topoServer *topo.Server) (*TxThrott return newTxThrottler(env, &txThrottlerConfig{ enabled: true, topoServer: topoServer, + tabletTypes: env.Config().TxThrottlerTabletTypes, throttlerConfig: &throttlerConfig, healthCheckCells: healthCheckCells, }) @@ -199,10 +200,15 @@ type txThrottlerConfig struct { // healthCheckCells stores the cell names in which running vttablets will be monitored for // replication 
lag. healthCheckCells []string + + // tabletTypes stores the tablet types for throttling + tabletTypes *topoproto.TabletTypeListFlag } // txThrottlerState holds the state of an open TxThrottler object. type txThrottlerState struct { + config *txThrottlerConfig + // throttleMu serializes calls to throttler.Throttler.Throttle(threadId). // That method is required to be called in serial for each threadId. throttleMu sync.Mutex @@ -300,6 +306,7 @@ func newTxThrottlerState(config *txThrottlerConfig, keyspace, shard, cell string return nil, err } result := &txThrottlerState{ + config: config, throttler: t, } createTxThrottlerHealthCheck(config, result, cell) @@ -369,14 +376,16 @@ func (ts *txThrottlerState) deallocateResources() { // StatsUpdate updates the health of a tablet with the given healthcheck. func (ts *txThrottlerState) StatsUpdate(tabletStats *discovery.TabletHealth) { - // Ignore PRIMARY and RDONLY stats. - // We currently do not monitor RDONLY tablets for replication lag. RDONLY tablets are not - // candidates for becoming primary during failover, and it's acceptable to serve somewhat - // stale date from these. - // TODO(erez): If this becomes necessary, we can add a configuration option that would - // determine whether we consider RDONLY tablets here, as well. - if tabletStats.Target.TabletType != topodatapb.TabletType_REPLICA { + if ts.config.tabletTypes == nil { return } - ts.throttler.RecordReplicationLag(time.Now(), tabletStats) + + // Monitor tablets for replication lag if they have a tablet + // type specified by the --tx-throttler-tablet-types flag. 
+ for _, expectedTabletType := range *ts.config.tabletTypes { + if tabletStats.Target.TabletType == expectedTabletType { + ts.throttler.RecordReplicationLag(time.Now(), tabletStats) + return + } + } } diff --git a/go/vt/vttablet/tabletserver/txthrottler/tx_throttler_test.go b/go/vt/vttablet/tabletserver/txthrottler/tx_throttler_test.go index f55b0800ca4..bb2934a2bea 100644 --- a/go/vt/vttablet/tabletserver/txthrottler/tx_throttler_test.go +++ b/go/vt/vttablet/tabletserver/txthrottler/tx_throttler_test.go @@ -32,6 +32,7 @@ import ( "vitess.io/vitess/go/vt/throttler" "vitess.io/vitess/go/vt/topo" "vitess.io/vitess/go/vt/topo/memorytopo" + "vitess.io/vitess/go/vt/topo/topoproto" "vitess.io/vitess/go/vt/vttablet/tabletserver/tabletenv" querypb "vitess.io/vitess/go/vt/proto/query" @@ -110,8 +111,9 @@ func TestEnabledThrottler(t *testing.T) { config := tabletenv.NewDefaultConfig() config.EnableTxThrottler = true config.TxThrottlerHealthCheckCells = []string{"cell1", "cell2"} - env := tabletenv.NewEnv(config, t.Name()) + config.TxThrottlerTabletTypes = &topoproto.TabletTypeListFlag{topodatapb.TabletType_REPLICA} + env := tabletenv.NewEnv(config, t.Name()) throttler, err := tryCreateTxThrottler(env, ts) assert.Nil(t, err) throttler.InitDBConfig(&querypb.Target{