Skip to content

Commit

Permalink
Merge pull request #489 from buildkite/feat/add-qps-and-burst-settings
Browse files Browse the repository at this point in the history
Allow configurable QPS, Burst settings with Kubernetes client, increase defaults
  • Loading branch information
tomowatt authored Jan 31, 2025
2 parents 107b1cf + 2e31a52 commit e18ad9e
Show file tree
Hide file tree
Showing 6 changed files with 35 additions and 0 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,8 @@ Flags:
--image string The image to use for the Buildkite agent (default "ghcr.io/buildkite/agent:3.78.0")
--image-pull-backoff-grace-period duration Duration after starting a pod that the controller will wait before considering cancelling a job due to ImagePullBackOff (e.g. when the podSpec specifies container images that cannot be pulled) (default 30s)
--job-ttl duration time to retain kubernetes jobs after completion (default 10m0s)
--k8s-client-rate-limiter-qps int sustained number of queries per second allowed to the Kubernetes API once the burst allowance has been exhausted (default 10)
--k8s-client-rate-limiter-burst int maximum number of queries sent to the Kubernetes API in a burst before requests are throttled to the QPS rate (default 20)
--max-in-flight int max jobs in flight, 0 means no max (default 25)
--namespace string kubernetes namespace to create resources in (default "default")
--org string Buildkite organization name to watch
Expand Down
12 changes: 12 additions & 0 deletions charts/agent-stack-k8s/values.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,18 @@
"title": "Duration after starting a pod that the controller will wait before considering cancelling a job due to ImagePullBackOff (e.g. when the podSpec specifies container images that cannot be pulled). Must be a Go duration string",
"examples": ["60s"]
},
"k8s-client-rate-limiter-qps": {
"type": "integer",
"default": 10,
"title": "QPS indicates the number of queries per second from this client to the Kubernetes API, once the number of queries defined for Burst has been exhausted. Used together with k8s-client-rate-limiter-burst.",
"examples": [20, 30, 50]
},
"k8s-client-rate-limiter-burst": {
"type": "integer",
"default": 20,
"title": "Maximum number of queries allowed before throttling requests (via QPS) to the Kubernetes API. Used together with k8s-client-rate-limiter-qps.",
"examples": [30, 50, 100]
},
"job-cancel-checker-poll-interval": {
"type": "string",
"default": "5s",
Expand Down
12 changes: 12 additions & 0 deletions cmd/controller/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,16 @@ func AddConfigFlags(cmd *cobra.Command) {
config.DefaultJobCreationConcurrency,
"Number of concurrent goroutines to run for converting Buildkite jobs into Kubernetes jobs",
)
cmd.Flags().Int(
"k8s-client-rate-limiter-qps",
config.DefaultK8sClientRateLimiterQPS,
"The QPS value of the K8s client rate limiter.",
)
cmd.Flags().Int(
"k8s-client-rate-limiter-burst",
config.DefaultK8sClientRateLimiterBurst,
"The burst value of the K8s client rate limiter.",
)
cmd.Flags().Duration(
"image-pull-backoff-grace-period",
config.DefaultImagePullBackOffGracePeriod,
Expand Down Expand Up @@ -279,6 +289,8 @@ func New() *cobra.Command {
logger.Info("configuration loaded", zap.Object("config", cfg))

clientConfig := restconfig.GetConfigOrDie()
clientConfig.QPS = float32(cfg.K8sClientRateLimiterQPS)
clientConfig.Burst = cfg.K8sClientRateLimiterBurst
k8sClient, err := kubernetes.NewForConfig(clientConfig)
if err != nil {
logger.Error("failed to create clientset", zap.Error(err))
Expand Down
2 changes: 2 additions & 0 deletions cmd/controller/controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ func TestReadAndParseConfig(t *testing.T) {
StaleJobDataTimeout: 10 * time.Second,
JobCreationConcurrency: 5,
MaxInFlight: 100,
K8sClientRateLimiterQPS: 20,
K8sClientRateLimiterBurst: 30,
Namespace: "my-buildkite-ns",
Org: "my-buildkite-org",
Tags: []string{"queue=my-queue", "priority=high"},
Expand Down
2 changes: 2 additions & 0 deletions examples/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ poll-interval: 5s
stale-job-data-timeout: 10s
job-creation-concurrency: 5
max-in-flight: 100
k8s-client-rate-limiter-qps: 20
k8s-client-rate-limiter-burst: 30
namespace: my-buildkite-ns
org: my-buildkite-org
default-image-pull-policy: Never
Expand Down
5 changes: 5 additions & 0 deletions internal/controller/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ const (
DefaultJobCancelCheckerPollInterval = 5 * time.Second
DefaultEmptyJobGracePeriod = 30 * time.Second
DefaultJobCreationConcurrency = 5
DefaultK8sClientRateLimiterQPS = 10
DefaultK8sClientRateLimiterBurst = 20
)

var DefaultAgentImage = "ghcr.io/buildkite/agent:" + version.Version()
Expand All @@ -45,6 +47,9 @@ type Config struct {
GraphQLEndpoint string `json:"graphql-endpoint" validate:"omitempty"`
// Agent endpoint is set in agent-config.

K8sClientRateLimiterQPS int `json:"k8s-client-rate-limiter-qps" validate:"omitempty"`
K8sClientRateLimiterBurst int `json:"k8s-client-rate-limiter-burst" validate:"omitempty"`

// ClusterUUID field is mandatory for most new orgs.
// Some old orgs allows unclustered setup.
ClusterUUID string `json:"cluster-uuid" validate:"omitempty"`
Expand Down

0 comments on commit e18ad9e

Please sign in to comment.