Skip to content

Commit

Permalink
Exclude non-ready nodes from scheduling.
Browse files Browse the repository at this point in the history
  • Loading branch information
mszadkow committed Nov 1, 2024
1 parent 6047afe commit 275a18a
Show file tree
Hide file tree
Showing 5 changed files with 236 additions and 6 deletions.
161 changes: 161 additions & 0 deletions pkg/cache/tas_cache_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,12 @@ func TestFindTopologyAssignment(t *testing.T) {
corev1.ResourceCPU: resource.MustParse("1"),
corev1.ResourceMemory: resource.MustParse("1Gi"),
},
Conditions: []corev1.NodeCondition{
{
Type: corev1.NodeReady,
Status: corev1.ConditionTrue,
},
},
},
},
{
Expand All @@ -71,6 +77,12 @@ func TestFindTopologyAssignment(t *testing.T) {
corev1.ResourceCPU: resource.MustParse("1"),
corev1.ResourceMemory: resource.MustParse("1Gi"),
},
Conditions: []corev1.NodeCondition{
{
Type: corev1.NodeReady,
Status: corev1.ConditionTrue,
},
},
},
},
{
Expand All @@ -87,6 +99,12 @@ func TestFindTopologyAssignment(t *testing.T) {
corev1.ResourceCPU: resource.MustParse("1"),
corev1.ResourceMemory: resource.MustParse("1Gi"),
},
Conditions: []corev1.NodeCondition{
{
Type: corev1.NodeReady,
Status: corev1.ConditionTrue,
},
},
},
},
{
Expand All @@ -103,6 +121,12 @@ func TestFindTopologyAssignment(t *testing.T) {
corev1.ResourceCPU: resource.MustParse("1"),
corev1.ResourceMemory: resource.MustParse("1Gi"),
},
Conditions: []corev1.NodeCondition{
{
Type: corev1.NodeReady,
Status: corev1.ConditionTrue,
},
},
},
},
{
Expand All @@ -119,6 +143,12 @@ func TestFindTopologyAssignment(t *testing.T) {
corev1.ResourceCPU: resource.MustParse("1"),
corev1.ResourceMemory: resource.MustParse("1Gi"),
},
Conditions: []corev1.NodeCondition{
{
Type: corev1.NodeReady,
Status: corev1.ConditionTrue,
},
},
},
},
{
Expand All @@ -135,6 +165,12 @@ func TestFindTopologyAssignment(t *testing.T) {
corev1.ResourceCPU: resource.MustParse("2"),
corev1.ResourceMemory: resource.MustParse("4Gi"),
},
Conditions: []corev1.NodeCondition{
{
Type: corev1.NodeReady,
Status: corev1.ConditionTrue,
},
},
},
},
}
Expand Down Expand Up @@ -185,6 +221,12 @@ func TestFindTopologyAssignment(t *testing.T) {
Allocatable: corev1.ResourceList{
corev1.ResourceCPU: resource.MustParse("2"),
},
Conditions: []corev1.NodeCondition{
{
Type: corev1.NodeReady,
Status: corev1.ConditionTrue,
},
},
},
},
{
Expand All @@ -200,6 +242,12 @@ func TestFindTopologyAssignment(t *testing.T) {
Allocatable: corev1.ResourceList{
corev1.ResourceCPU: resource.MustParse("2"),
},
Conditions: []corev1.NodeCondition{
{
Type: corev1.NodeReady,
Status: corev1.ConditionTrue,
},
},
},
},
{
Expand All @@ -215,6 +263,12 @@ func TestFindTopologyAssignment(t *testing.T) {
Allocatable: corev1.ResourceList{
corev1.ResourceCPU: resource.MustParse("1"),
},
Conditions: []corev1.NodeCondition{
{
Type: corev1.NodeReady,
Status: corev1.ConditionTrue,
},
},
},
},
{
Expand All @@ -230,6 +284,12 @@ func TestFindTopologyAssignment(t *testing.T) {
Allocatable: corev1.ResourceList{
corev1.ResourceCPU: resource.MustParse("1"),
},
Conditions: []corev1.NodeCondition{
{
Type: corev1.NodeReady,
Status: corev1.ConditionTrue,
},
},
},
},
{
Expand All @@ -245,6 +305,12 @@ func TestFindTopologyAssignment(t *testing.T) {
Allocatable: corev1.ResourceList{
corev1.ResourceCPU: resource.MustParse("1"),
},
Conditions: []corev1.NodeCondition{
{
Type: corev1.NodeReady,
Status: corev1.ConditionTrue,
},
},
},
},
{
Expand All @@ -260,6 +326,12 @@ func TestFindTopologyAssignment(t *testing.T) {
Allocatable: corev1.ResourceList{
corev1.ResourceCPU: resource.MustParse("1"),
},
Conditions: []corev1.NodeCondition{
{
Type: corev1.NodeReady,
Status: corev1.ConditionTrue,
},
},
},
},
},
Expand Down Expand Up @@ -656,6 +728,12 @@ func TestFindTopologyAssignment(t *testing.T) {
corev1.ResourceCPU: resource.MustParse("1"),
corev1.ResourceMemory: resource.MustParse("1Gi"),
},
Conditions: []corev1.NodeCondition{
{
Type: corev1.NodeReady,
Status: corev1.ConditionTrue,
},
},
},
},
},
Expand Down Expand Up @@ -698,6 +776,12 @@ func TestFindTopologyAssignment(t *testing.T) {
corev1.ResourceCPU: resource.MustParse("1"),
corev1.ResourceMemory: resource.MustParse("1Gi"),
},
Conditions: []corev1.NodeCondition{
{
Type: corev1.NodeReady,
Status: corev1.ConditionTrue,
},
},
},
},
},
Expand Down Expand Up @@ -726,6 +810,12 @@ func TestFindTopologyAssignment(t *testing.T) {
corev1.ResourceCPU: resource.MustParse("1"),
corev1.ResourceMemory: resource.MustParse("1Gi"),
},
Conditions: []corev1.NodeCondition{
{
Type: corev1.NodeReady,
Status: corev1.ConditionTrue,
},
},
},
},
},
Expand Down Expand Up @@ -768,6 +858,12 @@ func TestFindTopologyAssignment(t *testing.T) {
corev1.ResourceCPU: resource.MustParse("1"),
corev1.ResourceMemory: resource.MustParse("1Gi"),
},
Conditions: []corev1.NodeCondition{
{
Type: corev1.NodeReady,
Status: corev1.ConditionTrue,
},
},
},
},
},
Expand Down Expand Up @@ -816,6 +912,12 @@ func TestFindTopologyAssignment(t *testing.T) {
corev1.ResourceCPU: resource.MustParse("1"),
corev1.ResourceMemory: resource.MustParse("1Gi"),
},
Conditions: []corev1.NodeCondition{
{
Type: corev1.NodeReady,
Status: corev1.ConditionTrue,
},
},
},
},
},
Expand Down Expand Up @@ -850,6 +952,12 @@ func TestFindTopologyAssignment(t *testing.T) {
corev1.ResourceCPU: resource.MustParse("1"),
corev1.ResourceMemory: resource.MustParse("1Gi"),
},
Conditions: []corev1.NodeCondition{
{
Type: corev1.NodeReady,
Status: corev1.ConditionTrue,
},
},
},
},
},
Expand Down Expand Up @@ -885,6 +993,12 @@ func TestFindTopologyAssignment(t *testing.T) {
corev1.ResourceCPU: resource.MustParse("1"),
corev1.ResourceMemory: resource.MustParse("1Gi"),
},
Conditions: []corev1.NodeCondition{
{
Type: corev1.NodeReady,
Status: corev1.ConditionTrue,
},
},
},
},
{
Expand All @@ -899,6 +1013,12 @@ func TestFindTopologyAssignment(t *testing.T) {
corev1.ResourceCPU: resource.MustParse("1"),
corev1.ResourceMemory: resource.MustParse("1Gi"),
},
Conditions: []corev1.NodeCondition{
{
Type: corev1.NodeReady,
Status: corev1.ConditionTrue,
},
},
},
},
},
Expand Down Expand Up @@ -927,6 +1047,47 @@ func TestFindTopologyAssignment(t *testing.T) {
},
},
},
"no assignment as node is not ready": {
nodes: []corev1.Node{
{
ObjectMeta: metav1.ObjectMeta{
Name: "b1-r1-x1",
Labels: map[string]string{
"zone": "zone-a",
tasHostLabel: "x1",
},
},
Status: corev1.NodeStatus{
Allocatable: corev1.ResourceList{
corev1.ResourceCPU: resource.MustParse("1"),
corev1.ResourceMemory: resource.MustParse("1Gi"),
},
Conditions: []corev1.NodeCondition{
{
Type: corev1.NodeReady,
Status: corev1.ConditionFalse,
},
{
Type: corev1.NodeNetworkUnavailable,
Status: corev1.ConditionTrue,
},
},
},
},
},
request: kueue.PodSetTopologyRequest{
Required: ptr.To(tasHostLabel),
},
nodeLabels: map[string]string{
"zone": "zone-a",
},
levels: defaultOneLevel,
requests: resources.Requests{
corev1.ResourceCPU: 1000,
},
count: 1,
wantAssignment: nil,
},
}
for name, tc := range cases {
t.Run(name, func(t *testing.T) {
Expand Down
21 changes: 15 additions & 6 deletions pkg/cache/tas_flavor.go
Original file line number Diff line number Diff line change
Expand Up @@ -108,12 +108,21 @@ func (c *TASFlavorCache) snapshotForNodes(log logr.Logger, nodes []corev1.Node,
snapshot := newTASFlavorSnapshot(log, c.Levels)
nodeToDomain := make(map[string]utiltas.TopologyDomainID)
for _, node := range nodes {
levelValues := utiltas.LevelValues(c.Levels, node.Labels)
capacity := resources.NewRequests(node.Status.Allocatable)
domainID := utiltas.DomainID(levelValues)
snapshot.levelValuesPerDomain[domainID] = levelValues
snapshot.addCapacity(domainID, capacity)
nodeToDomain[node.Name] = domainID
// Only nodes that are healthy and ready to accept pods are considered for
// the scheduling calculation. Readiness is decided solely by the NodeReady
// condition: look it up by type and stop there, so that the other
// conditions in the list (in whatever order they are reported) cannot
// overwrite the result. A node without a NodeReady condition is treated as
// not ready.
ready := false
for _, cond := range node.Status.Conditions {
	if cond.Type == corev1.NodeReady {
		ready = cond.Status == corev1.ConditionTrue
		break
	}
}
if ready {
levelValues := utiltas.LevelValues(c.Levels, node.Labels)
capacity := resources.NewRequests(node.Status.Allocatable)
domainID := utiltas.DomainID(levelValues)
snapshot.levelValuesPerDomain[domainID] = levelValues
snapshot.addCapacity(domainID, capacity)
nodeToDomain[node.Name] = domainID
} else {
log.V(3).Info("Node was excluded from TAS Flavor snapshot", "nodeName", node.Name, "nodeStatusConditions", node.Status.Conditions)
}
}
snapshot.initialize()
for domainID, usage := range c.usage {
Expand Down
12 changes: 12 additions & 0 deletions pkg/scheduler/scheduler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3881,6 +3881,12 @@ func TestScheduleForTAS(t *testing.T) {
corev1.ResourceCPU: resource.MustParse("1"),
corev1.ResourceMemory: resource.MustParse("1Gi"),
},
Conditions: []corev1.NodeCondition{
{
Type: corev1.NodeReady,
Status: corev1.ConditionTrue,
},
},
},
},
}
Expand Down Expand Up @@ -4202,6 +4208,12 @@ func TestScheduleForTAS(t *testing.T) {
Allocatable: corev1.ResourceList{
corev1.ResourceCPU: resource.MustParse("1"),
},
Conditions: []corev1.NodeCondition{
{
Type: corev1.NodeReady,
Status: corev1.ConditionTrue,
},
},
},
},
},
Expand Down
6 changes: 6 additions & 0 deletions test/integration/controller/jobs/job/job_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2208,6 +2208,12 @@ var _ = ginkgo.Describe("Job controller when TopologyAwareScheduling enabled", g
corev1.ResourceCPU: resource.MustParse("1"),
corev1.ResourceMemory: resource.MustParse("1Gi"),
},
Conditions: []corev1.NodeCondition{
{
Type: corev1.NodeReady,
Status: corev1.ConditionTrue,
},
},
},
},
}
Expand Down
Loading

0 comments on commit 275a18a

Please sign in to comment.