Skip to content

Commit

Permalink
Merge pull request #497 from buildkite/SUP-3258-job-active-deadline-s…
Browse files Browse the repository at this point in the history
…econds

SUP-3258 - Implement .spec.job.activeDeadlineSeconds
  • Loading branch information
petetomasik authored Feb 13, 2025
2 parents 3b466a7 + 7a13241 commit 835cc76
Show file tree
Hide file tree
Showing 10 changed files with 105 additions and 25 deletions.
27 changes: 27 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
- [Skipping checkout](#Skipping-checkout)
- [Overriding flags for git clone/fetch](#Overriding-flags-for-git-clonefetch)
- [Validating your pipeline](#Validating-your-pipeline)
- [Long-running jobs](#long-running-jobs)
- [Securing the stack](#securing-the-stack)
- [Prohibiting the kubernetes plugin (v0.13.0 and later)](#prohibiting-the-kubernetes-plugin-v0130-and-later)
- [How to setup agent hooks](#How-to-setup-agent-hooks)
Expand Down Expand Up @@ -138,6 +139,7 @@ Flags:
--job-cancel-checker-poll-interval duration Controls the interval between job state queries while a pod is still Pending (default 5s)
--job-creation-concurrency int Number of concurrent goroutines to run for converting Buildkite jobs into Kubernetes jobs (default 5)
--job-ttl duration time to retain kubernetes jobs after completion (default 10m0s)
--job-active-deadline-seconds int maximum number of seconds a kubernetes job is allowed to run before terminating all pods and failing (default 21600)
--k8s-client-rate-limiter-burst int The burst value of the K8s client rate limiter. (default 20)
--k8s-client-rate-limiter-qps int The QPS value of the K8s client rate limiter. (default 10)
--max-in-flight int max jobs in flight, 0 means no max (default 25)
Expand Down Expand Up @@ -1030,6 +1032,28 @@ This currently can't prevent every sort of error, you might still have a referen
Our JSON schema can also be used with editors that support JSON Schema by configuring your editor to validate against the schema found [here](./cmd/linter/schema.json).
### Long-running jobs
With the addition of `.spec.job.activeDeadlineSeconds` in version [`v0.24.0`](https://github.com/buildkite/agent-stack-k8s/releases/tag/v0.24.0), Kubernetes jobs will run for a (default) maximum duration of `21600` seconds (6 hours). After this duration has been exceeded, all of the running Pods are terminated and the Job status will be `type: Failed`. This will be reflected in the Buildkite UI as `Exited with status -1 (agent lost)`.
If long-running jobs are common in your Organization, this value should be increased in your controller configuration:
```yaml
# values.yaml
...
config:
job-active-deadline-seconds: 86400 # 24h
...
```
It is also possible to override this configuration via the `kubernetes` plugin directly in your pipeline steps. The override only applies to the `command` step it is set on:
```yaml
steps:
- label: Long-running job
command: echo "Hello world" && sleep 43200
plugins:
- kubernetes:
jobActiveDeadlineSeconds: 43500
```
## Securing the stack
### Prohibiting the kubernetes plugin (v0.13.0 and later)
Expand All @@ -1054,6 +1078,9 @@ With `prohibit-kubernetes-plugin` enabled, any job containing the kubernetes
plugin will fail.

## Debugging

Enable debug logging via the command line (`--debug`) or within the `values.yaml` file (`debug: true`).

Use the `log-collector` script in the `utils` folder to collect logs for agent-stack-k8s.

### Prerequisites
Expand Down
6 changes: 6 additions & 0 deletions charts/agent-stack-k8s/values.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,12 @@
"title": "The job-ttl Schema",
"examples": [""]
},
"job-active-deadline-seconds": {
"type": "integer",
"default": 21600,
"title": "The maximum number of seconds a kubernetes job is allowed to run before terminating all pods and failing the job",
"examples": [43200, 86400]
},
"max-in-flight": {
"type": "integer",
"default": 0,
Expand Down
5 changes: 5 additions & 0 deletions cmd/controller/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,11 @@ func AddConfigFlags(cmd *cobra.Command) {
10*time.Minute,
"time to retain kubernetes jobs after completion",
)
cmd.Flags().Int(
"job-active-deadline-seconds",
21600,
"maximum number of seconds a kubernetes job is allowed to run before terminating all pods and failing job",
)
cmd.Flags().Duration(
"poll-interval",
time.Second,
Expand Down
1 change: 1 addition & 0 deletions cmd/controller/controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ func TestReadAndParseConfig(t *testing.T) {
BuildkiteToken: "my-graphql-enabled-token",
Image: "my.registry.dev/buildkite-agent:latest",
JobTTL: 300 * time.Second,
JobActiveDeadlineSeconds: 21600,
ImagePullBackOffGracePeriod: 60 * time.Second,
JobCancelCheckerPollInterval: 10 * time.Second,
EmptyJobGracePeriod: 50 * time.Second,
Expand Down
1 change: 1 addition & 0 deletions examples/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ agent-token-secret: my-kubernetes-secret
debug: true
image: my.registry.dev/buildkite-agent:latest
job-ttl: 5m
job-active-deadline-seconds: 21600
image-pull-backoff-grace-period: 60s
job-cancel-checker-poll-interval: 10s
empty-job-grace-period: 50s
Expand Down
34 changes: 18 additions & 16 deletions internal/controller/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,22 +31,23 @@ var DefaultAgentImage = "ghcr.io/buildkite/agent:" + version.Version()
// mapstructure (the module) supports switching the struct tag to "json", viper does not. So we have
// to have the `mapstructure` tag for viper and the `json` tag is used by the mapstructure!
type Config struct {
Debug bool `json:"debug"`
JobTTL time.Duration `json:"job-ttl"`
PollInterval time.Duration `json:"poll-interval"`
StaleJobDataTimeout time.Duration `json:"stale-job-data-timeout" validate:"omitempty"`
JobCreationConcurrency int `json:"job-creation-concurrency" validate:"omitempty"`
AgentTokenSecret string `json:"agent-token-secret" validate:"required"`
BuildkiteToken string `json:"buildkite-token" validate:"required"`
Image string `json:"image" validate:"required"`
MaxInFlight int `json:"max-in-flight" validate:"min=0"`
Namespace string `json:"namespace" validate:"required"`
Org string `json:"org" validate:"required"`
Tags stringSlice `json:"tags" validate:"min=1"`
PrometheusPort uint16 `json:"prometheus-port" validate:"omitempty"`
ProfilerAddress string `json:"profiler-address" validate:"omitempty,hostname_port"`
GraphQLEndpoint string `json:"graphql-endpoint" validate:"omitempty"`
GraphQLResultsLimit int `json:"graphql-results-limit" validate:"min=1,max=500"`
Debug bool `json:"debug"`
JobTTL time.Duration `json:"job-ttl"`
JobActiveDeadlineSeconds int `json:"job-active-deadline-seconds" validate:"required"`
PollInterval time.Duration `json:"poll-interval"`
StaleJobDataTimeout time.Duration `json:"stale-job-data-timeout" validate:"omitempty"`
JobCreationConcurrency int `json:"job-creation-concurrency" validate:"omitempty"`
AgentTokenSecret string `json:"agent-token-secret" validate:"required"`
BuildkiteToken string `json:"buildkite-token" validate:"required"`
Image string `json:"image" validate:"required"`
MaxInFlight int `json:"max-in-flight" validate:"min=0"`
Namespace string `json:"namespace" validate:"required"`
Org string `json:"org" validate:"required"`
Tags stringSlice `json:"tags" validate:"min=1"`
PrometheusPort uint16 `json:"prometheus-port" validate:"omitempty"`
ProfilerAddress string `json:"profiler-address" validate:"omitempty,hostname_port"`
GraphQLEndpoint string `json:"graphql-endpoint" validate:"omitempty"`
GraphQLResultsLimit int `json:"graphql-results-limit" validate:"min=1,max=500"`
// Agent endpoint is set in agent-config.

K8sClientRateLimiterQPS int `json:"k8s-client-rate-limiter-qps" validate:"omitempty"`
Expand Down Expand Up @@ -94,6 +95,7 @@ func (c Config) MarshalLogObject(enc zapcore.ObjectEncoder) error {
enc.AddBool("debug", c.Debug)
enc.AddString("image", c.Image)
enc.AddDuration("job-ttl", c.JobTTL)
enc.AddInt("job-active-deadline-seconds", c.JobActiveDeadlineSeconds)
enc.AddDuration("poll-interval", c.PollInterval)
enc.AddDuration("stale-job-data-timeout", c.StaleJobDataTimeout)
enc.AddInt("job-creation-concurrency", c.JobCreationConcurrency)
Expand Down
1 change: 1 addition & 0 deletions internal/controller/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ func Run(
Image: cfg.Image,
AgentTokenSecretName: cfg.AgentTokenSecret,
JobTTL: cfg.JobTTL,
JobActiveDeadlineSeconds: cfg.JobActiveDeadlineSeconds,
AdditionalRedactedVars: cfg.AdditionalRedactedVars,
WorkspaceVolume: cfg.WorkspaceVolume,
AgentConfig: cfg.AgentConfig,
Expand Down
28 changes: 19 additions & 9 deletions internal/controller/scheduler/scheduler.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ type Config struct {
Image string
AgentTokenSecretName string
JobTTL time.Duration
JobActiveDeadlineSeconds int
AdditionalRedactedVars []string
WorkspaceVolume *corev1.Volume
AgentConfig *config.AgentConfig
Expand All @@ -76,15 +77,16 @@ func New(logger *zap.Logger, client kubernetes.Interface, cfg Config) *worker {
}

type KubernetesPlugin struct {
PodSpec *corev1.PodSpec `json:"podSpec,omitempty"`
PodSpecPatch *corev1.PodSpec `json:"podSpecPatch,omitempty"`
GitEnvFrom []corev1.EnvFromSource `json:"gitEnvFrom,omitempty"`
Sidecars []corev1.Container `json:"sidecars,omitempty"`
Metadata config.Metadata `json:"metadata,omitempty"`
ExtraVolumeMounts []corev1.VolumeMount `json:"extraVolumeMounts,omitempty"`
CheckoutParams *config.CheckoutParams `json:"checkout,omitempty"`
CommandParams *config.CommandParams `json:"commandParams,omitempty"`
SidecarParams *config.SidecarParams `json:"sidecarParams,omitempty"`
PodSpec *corev1.PodSpec `json:"podSpec,omitempty"`
PodSpecPatch *corev1.PodSpec `json:"podSpecPatch,omitempty"`
GitEnvFrom []corev1.EnvFromSource `json:"gitEnvFrom,omitempty"`
Sidecars []corev1.Container `json:"sidecars,omitempty"`
Metadata config.Metadata `json:"metadata,omitempty"`
ExtraVolumeMounts []corev1.VolumeMount `json:"extraVolumeMounts,omitempty"`
CheckoutParams *config.CheckoutParams `json:"checkout,omitempty"`
CommandParams *config.CommandParams `json:"commandParams,omitempty"`
SidecarParams *config.SidecarParams `json:"sidecarParams,omitempty"`
JobActiveDeadlineSeconds int `json:"jobActiveDeadlineSeconds,omitempty"`
}

type worker struct {
Expand Down Expand Up @@ -355,6 +357,14 @@ func (w *worker) Build(podSpec *corev1.PodSpec, skipCheckout bool, inputs buildI
ttl := int32(w.cfg.JobTTL.Seconds())
kjob.Spec.TTLSecondsAfterFinished = &ttl

activeDeadlineSeconds := int64(w.cfg.JobActiveDeadlineSeconds)
kjob.Spec.ActiveDeadlineSeconds = &activeDeadlineSeconds

if inputs.k8sPlugin != nil && int64(inputs.k8sPlugin.JobActiveDeadlineSeconds) > 0 {
activeDeadlineSeconds = int64(inputs.k8sPlugin.JobActiveDeadlineSeconds)
kjob.Spec.ActiveDeadlineSeconds = &activeDeadlineSeconds
}

// Env vars used for command containers
containerEnv := append([]corev1.EnvVar{}, env...)
containerEnv = append(containerEnv, []corev1.EnvVar{
Expand Down
8 changes: 8 additions & 0 deletions internal/integration/fixtures/jobactivedeadlineseconds.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
steps:
- label: ":x:"
agents:
queue: "{{.queue}}"
command: sleep 90
plugins:
- kubernetes:
jobActiveDeadlineSeconds: 60
19 changes: 19 additions & 0 deletions internal/integration/integration_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -597,3 +597,22 @@ func TestCancelCheckerEvictsPod(t *testing.T) {
t.Error("The agent ran and handled cancellation")
}
}

// TestJobActiveDeadlineSeconds runs the jobactivedeadlineseconds.yaml fixture
// against a live controller and asserts that the resulting build fails.
//
// The fixture's single step runs `sleep 90` with the kubernetes plugin's
// jobActiveDeadlineSeconds set to 60, so the command outlasts the deadline.
// NOTE(review): presumably the failure is caused by Kubernetes terminating the
// Job once the deadline passes; this test only observes the build outcome and
// the logs, not the Job status itself.
func TestJobActiveDeadlineSeconds(t *testing.T) {
	tc := testcase{
		T:       t,
		Fixture: "jobactivedeadlineseconds.yaml",
		Repo:    repoHTTP,
		GraphQL: api.NewClient(cfg.BuildkiteToken, cfg.GraphQLEndpoint),
	}.Init()
	ctx := context.Background()
	pipelineID := tc.PrepareQueueAndPipelineWithCleanup(ctx)
	tc.StartController(ctx, cfg)
	build := tc.TriggerBuild(ctx, pipelineID)
	// The build is expected to fail rather than pass.
	tc.AssertFail(ctx, build)
	time.Sleep(5 * time.Second) // trying to reduce flakes: logs not immediately available
	logs := tc.FetchLogs(build)
	// The logs must not contain the agent's cancellation message.
	// NOTE(review): this check and its failure message are the same as in
	// TestCancelCheckerEvictsPod; confirm the wording is intended for the
	// deadline-exceeded case as well.
	if strings.Contains(logs, "Received cancellation signal, interrupting") {
		t.Error("The agent ran and handled cancellation")
	}
}

0 comments on commit 835cc76

Please sign in to comment.