From aff55add7d1d65711cfb58834cfe1b5067e2ac31 Mon Sep 17 00:00:00 2001 From: hc-github-team-nomad-core <82989552+hc-github-team-nomad-core@users.noreply.github.com> Date: Fri, 9 Feb 2024 17:41:36 -0600 Subject: [PATCH] backport of commit 4a8b01430b715dd73acd6d78d36ae3bdd7bc5055 (#19938) Co-authored-by: Luiz Aoqui --- .changelog/19933.txt | 3 ++ nomad/job_endpoint_test.go | 84 ++++++++++++++++++++++++++++++++++++++ scheduler/rank.go | 8 ++++ testutil/wait.go | 51 +++++++++++++++++++++++ 4 files changed, 146 insertions(+) create mode 100644 .changelog/19933.txt diff --git a/.changelog/19933.txt b/.changelog/19933.txt new file mode 100644 index 00000000000..008dab6bff6 --- /dev/null +++ b/.changelog/19933.txt @@ -0,0 +1,3 @@ +```release-note:bug +scheduler: Fixed a bug that caused blocked evaluations due to port conflict to not have a reason explaining why the evaluation was blocked +``` diff --git a/nomad/job_endpoint_test.go b/nomad/job_endpoint_test.go index 7f3529fe3ab..0f757a5f180 100644 --- a/nomad/job_endpoint_test.go +++ b/nomad/job_endpoint_test.go @@ -2369,6 +2369,90 @@ func TestJobRegister_ACL_RejectedBySchedulerConfig(t *testing.T) { } } +func TestJobEndpoint_Register_PortCollistion(t *testing.T) { + ci.Parallel(t) + + testCases := []struct { + name string + configFn func(c *Config) + }{ + { + name: "no preemption", + configFn: func(c *Config) { + c.DefaultSchedulerConfig = structs.SchedulerConfiguration{ + PreemptionConfig: structs.PreemptionConfig{ + ServiceSchedulerEnabled: false, + }, + } + }, + }, + { + name: "with preemption", + configFn: func(c *Config) { + c.DefaultSchedulerConfig = structs.SchedulerConfiguration{ + PreemptionConfig: structs.PreemptionConfig{ + ServiceSchedulerEnabled: true, + }, + } + }, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + s1, cleanupS1 := TestServer(t, tc.configFn) + defer cleanupS1() + state := s1.fsm.State() + + // Create test node. + node := mock.Node() + must.NoError(t, state.UpsertNode(structs.MsgTypeTestSetup, 1000, node)) + + // Create test job with a static port. + job := mock.Job() + job.TaskGroups[0].Count = 1 + job.TaskGroups[0].Networks[0].DynamicPorts = nil + job.TaskGroups[0].Networks[0].ReservedPorts = []structs.Port{ + {Label: "http", Value: 80}, + } + job.TaskGroups[0].Tasks[0].Services = nil + + testutil.RegisterJob(t, s1.RPC, job) + testutil.WaitForJobAllocStatus(t, s1.RPC, job, map[string]int{ + structs.AllocClientStatusPending: 1, + }) + + // Register second job with port conflict. + job2 := job.Copy() + job2.ID = fmt.Sprintf("conflict-%s", uuid.Generate()) + job2.Name = job2.ID + + testutil.RegisterJob(t, s1.RPC, job2) + + // Wait for job registration eval to complete. + evals := testutil.WaitForJobEvalStatus(t, s1.RPC, job2, map[string]int{ + structs.EvalStatusComplete: 1, + structs.EvalStatusBlocked: 1, + }) + + var blockedEval *structs.Evaluation + for _, e := range evals { + if e.Status == structs.EvalStatusBlocked { + blockedEval = e + break + } + } + + // Ensure blocked eval is properly annotated. + must.MapLen(t, 1, blockedEval.FailedTGAllocs) + must.NotNil(t, blockedEval.FailedTGAllocs["web"]) + must.Eq(t, map[string]int{ + "network: reserved port collision http=80": 1, + }, blockedEval.FailedTGAllocs["web"].DimensionExhausted) + }) + } +} + func TestJobEndpoint_Revert(t *testing.T) { ci.Parallel(t) diff --git a/scheduler/rank.go b/scheduler/rank.go index ae6b68c54e2..b38ad9243c1 100644 --- a/scheduler/rank.go +++ b/scheduler/rank.go @@ -313,6 +313,8 @@ OUTER: netPreemptions := preemptor.PreemptForNetwork(ask, netIdx) if netPreemptions == nil { iter.ctx.Logger().Named("binpack").Debug("preemption not possible ", "network_resource", ask) + iter.ctx.Metrics().ExhaustedNode(option.Node, + fmt.Sprintf("network: %s", err)) netIdx.Release() continue OUTER } @@ -330,6 +332,8 @@ OUTER: offer, err = netIdx.AssignPorts(ask) if err != nil { iter.ctx.Logger().Named("binpack").Debug("unexpected error, unable to create network offer after considering preemption", "error", err) + iter.ctx.Metrics().ExhaustedNode(option.Node, + fmt.Sprintf("network: %s", err)) netIdx.Release() continue OUTER } @@ -383,6 +387,8 @@ OUTER: netPreemptions := preemptor.PreemptForNetwork(ask, netIdx) if netPreemptions == nil { iter.ctx.Logger().Named("binpack").Debug("preemption not possible ", "network_resource", ask) + iter.ctx.Metrics().ExhaustedNode(option.Node, + fmt.Sprintf("network: %s", err)) netIdx.Release() continue OUTER } @@ -400,6 +406,8 @@ OUTER: offer, err = netIdx.AssignTaskNetwork(ask) if offer == nil { iter.ctx.Logger().Named("binpack").Debug("unexpected error, unable to create network offer after considering preemption", "error", err) + iter.ctx.Metrics().ExhaustedNode(option.Node, + fmt.Sprintf("network: %s", err)) netIdx.Release() continue OUTER } diff --git a/testutil/wait.go b/testutil/wait.go index 3466f497b39..8f97a0945ee 100644 --- a/testutil/wait.go +++ b/testutil/wait.go @@ -359,6 +359,57 @@ func WaitForJobAllocStatusWithToken(t testing.TB, rpc rpcFn, job *structs.Job, a return allocs } +// WaitforJobEvalStatus blocks until the job's evals match the status described +// in the map of : . +func WaitForJobEvalStatus(t testing.TB, rpc rpcFn, job *structs.Job, evalStatus map[string]int) []*structs.Evaluation { + return WaitForJobEvalStatusWithToken(t, rpc, job, evalStatus, "") +} + +// WaitForJobEvalStatusWithToken is the same as WaitforJobEvalStatus with ACL +// enabled. +func WaitForJobEvalStatusWithToken(t testing.TB, rpc rpcFn, job *structs.Job, evalStatus map[string]int, token string) []*structs.Evaluation { + var evals []*structs.Evaluation + + errorFunc := func() error { + req := &structs.JobSpecificRequest{ + JobID: job.ID, + QueryOptions: structs.QueryOptions{ + AuthToken: token, + Namespace: job.Namespace, + Region: job.Region, + }, + } + var resp structs.JobEvaluationsResponse + err := rpc("Job.Evaluations", req, &resp) + if err != nil { + return fmt.Errorf("failed to call Job.Evaluations RPC: %w", err) + } + + got := make(map[string]int) + for _, eval := range resp.Evaluations { + got[eval.Status]++ + } + + if diff := cmp.Diff(evalStatus, got); diff != "" { + return fmt.Errorf("eval status mismatch (-want +got):\n%s", diff) + } + + evals = resp.Evaluations + return nil + } + + must.Wait(t, + wait.InitialSuccess( + wait.ErrorFunc(errorFunc), + wait.Timeout(time.Duration(TestMultiplier())*time.Second), + wait.Gap(10*time.Millisecond), + ), + must.Sprintf("failed to wait for job %s eval status", job.ID), + ) + + return evals +} + // WaitForFiles blocks until all the files in the slice are present func WaitForFiles(t testing.TB, files []string) { WaitForResult(func() (bool, error) {