TAS: introduce scheduler-level tests #3361

Merged · 2 commits · Oct 30, 2024
6 changes: 3 additions & 3 deletions pkg/cache/clusterqueue.go
@@ -173,15 +173,15 @@ func (c *clusterQueue) updateClusterQueue(cycleChecker hierarchy.CycleChecker, i

c.AdmissionChecks = utilac.NewAdmissionChecks(in)

c.UpdateWithFlavors(resourceFlavors)
c.updateWithAdmissionChecks(admissionChecks)

if in.Spec.Preemption != nil {
c.Preemption = *in.Spec.Preemption
} else {
c.Preemption = defaultPreemption
}

c.UpdateWithFlavors(resourceFlavors)
Contributor (author) commented:
FYI: the new unit test discovered this subtle bug - UpdateWithFlavors requires Preemption to already be set so that the check Preemption.withinClusterQueue != "Never" evaluates correctly (otherwise the field is still "", which is considered not equal, and the CQ is inactivated).
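
A minimal, runnable sketch (hypothetical types, not the actual Kueue code) of why the ordering matters: the Go zero value of the policy field is the empty string, which compares as not equal to "Never", so a ClusterQueue whose Preemption has not yet been populated from the spec looks as if it allowed preemption and is inactivated by the TAS check.

```go
package main

import "fmt"

// Hypothetical simplification of the real check: preemption within the
// cluster queue is considered enabled unless the policy is explicitly "Never".
type preemption struct {
	WithinClusterQueue string
}

type clusterQueue struct {
	Preemption preemption
}

func (c *clusterQueue) preemptionEnabledWithinCQ() bool {
	return c.Preemption.WithinClusterQueue != "Never"
}

func main() {
	cq := &clusterQueue{}

	// Check runs before Preemption is set (the old ordering of UpdateWithFlavors):
	fmt.Println(cq.preemptionEnabledWithinCQ()) // true — the CQ would be inactivated

	// Check runs after Preemption is populated from the spec (the new ordering):
	cq.Preemption = preemption{WithinClusterQueue: "Never"}
	fmt.Println(cq.preemptionEnabledWithinCQ()) // false — the CQ stays active
}
```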

Member commented:
I checked this bug in both flows: c.UpdateWithFlavors() -> c.updateQueueStatus() -> c.isTASViolated() and c.updateWithAdmissionChecks() -> c.updateQueueStatus() -> c.isTASViolated().

c.updateWithAdmissionChecks(admissionChecks)

if in.Spec.FlavorFungibility != nil {
c.FlavorFungibility = *in.Spec.FlavorFungibility
if c.FlavorFungibility.WhenCanBorrow == "" {
66 changes: 0 additions & 66 deletions pkg/cache/tas_cache_test.go
@@ -27,14 +27,10 @@ import (
"k8s.io/utils/ptr"
"sigs.k8s.io/controller-runtime/pkg/client"

kueuealpha "sigs.k8s.io/kueue/apis/kueue/v1alpha1"
kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1"
"sigs.k8s.io/kueue/pkg/resources"
"sigs.k8s.io/kueue/pkg/util/limitrange"
utiltas "sigs.k8s.io/kueue/pkg/util/tas"
utiltesting "sigs.k8s.io/kueue/pkg/util/testing"
testingpod "sigs.k8s.io/kueue/pkg/util/testingjobs/pod"
"sigs.k8s.io/kueue/pkg/workload"
)

func TestFindTopologyAssignment(t *testing.T) {
@@ -805,51 +801,6 @@ func TestFindTopologyAssignment(t *testing.T) {
},
},
},
"don't double-count TAS pods when computing the capacity": {
@mimowo (Contributor, author) commented on Oct 30, 2024:
FYI: this scenario is moved to the new suite as "workload gets scheduled as the usage of TAS pods and workloads is not double-counted", since it was tricky to account for the Workloads' usage in this setup (see the comment below). The old test created a "TAS" Pod that did not belong to any TAS workload; the new suite allows accounting for both Workloads and Pods.
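
A minimal sketch (hypothetical helper, not the Kueue implementation) of the double-counting concern: a TAS pod's usage is already tracked through its admitted Workload, so when summing per-node pod usage only non-TAS pods should be counted. The numbers mirror the removed test case below (1 CPU node, a 400m TAS pod, a 600m request).

```go
package main

import "fmt"

// podInfo is a hypothetical stand-in for the per-node pod usage data.
type podInfo struct {
	isTAS     bool
	cpuMillis int64
}

// freeCPUMillis subtracts the usage of admitted TAS workloads and of non-TAS
// pods; TAS pods are skipped because their usage is already part of
// tasWorkloadUsage, and counting them again would underestimate free capacity.
func freeCPUMillis(allocatable, tasWorkloadUsage int64, pods []podInfo) int64 {
	free := allocatable - tasWorkloadUsage
	for _, p := range pods {
		if p.isTAS {
			continue // already counted via its Workload
		}
		free -= p.cpuMillis
	}
	return free
}

func main() {
	pods := []podInfo{{isTAS: true, cpuMillis: 400}}
	// 1000m allocatable, 400m used by an admitted TAS workload whose pod is the
	// one above; counting the pod again would leave only 200m and block a 600m request.
	fmt.Println(freeCPUMillis(1000, 400, pods)) // 600
}
```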

nodes: []corev1.Node{
{
ObjectMeta: metav1.ObjectMeta{
Name: "x1",
Labels: map[string]string{
tasHostLabel: "x1",
},
},
Status: corev1.NodeStatus{
Allocatable: corev1.ResourceList{
corev1.ResourceCPU: resource.MustParse("1"),
corev1.ResourceMemory: resource.MustParse("1Gi"),
},
},
},
},
pods: []corev1.Pod{
*testingpod.MakePod("test-tas", "test-ns").NodeName("x1").
Request(corev1.ResourceCPU, "400m").
NodeSelector(tasHostLabel, "x1").
Label(kueuealpha.TASLabel, "true").
StatusPhase(corev1.PodRunning).
Obj(),
},
request: kueue.PodSetTopologyRequest{
Required: ptr.To(tasHostLabel),
},
levels: defaultOneLevel,
requests: resources.Requests{
corev1.ResourceCPU: 600,
},
count: 1,
wantAssignment: &kueue.TopologyAssignment{
Levels: defaultOneLevel,
Domains: []kueue.TopologyDomainAssignment{
{
Count: 1,
Values: []string{
"x1",
},
},
},
},
},
"include usage from pending scheduled non-TAS pods, blocked assignment": {
// there is not enough free capacity on the only node x1
nodes: []corev1.Node{
@@ -995,23 +946,6 @@ func TestFindTopologyAssignment(t *testing.T) {
tasCache := NewTASCache(client)
tasFlavorCache := tasCache.NewTASFlavorCache(tc.levels, tc.nodeLabels)

// account for usage from TAS pods for the need of the unit tests.
// note that in practice the usage is accounted based on TAS
// Workloads.
for _, pod := range tc.pods {
if _, ok := pod.Labels[kueuealpha.TASLabel]; ok {
levelValues := utiltas.LevelValues(tc.levels, pod.Spec.NodeSelector)
requests := limitrange.TotalRequests(&pod.Spec)
usage := resources.NewRequests(requests)
tasFlavorCache.addUsage([]workload.TopologyDomainRequests{
{
Values: levelValues,
Requests: usage,
},
})
}
}

snapshot, err := tasFlavorCache.snapshot(ctx)
if err != nil {
t.Fatalf("failed to build the snapshot: %v", err)
11 changes: 2 additions & 9 deletions pkg/controller/tas/resource_flavor.go
@@ -39,6 +39,7 @@ import (
"sigs.k8s.io/kueue/pkg/cache"
"sigs.k8s.io/kueue/pkg/controller/core"
"sigs.k8s.io/kueue/pkg/queue"
utiltas "sigs.k8s.io/kueue/pkg/util/tas"
)

const (
@@ -152,7 +153,7 @@ func (r *rfReconciler) Reconcile(ctx context.Context, req reconcile.Request) (re
}, &topology); err != nil {
return reconcile.Result{}, err
}
levels := r.levels(&topology)
levels := utiltas.Levels(&topology)
tasInfo := r.tasCache.NewTASFlavorCache(levels, flv.Spec.NodeLabels)
r.tasCache.Set(kueue.ResourceFlavorReference(flv.Name), tasInfo)
}
@@ -221,11 +222,3 @@ func nodeBelongsToFlavor(node *corev1.Node, nodeLabels map[string]string, levels
}
return true
}

func (r *rfReconciler) levels(topology *kueuealpha.Topology) []string {
result := make([]string, len(topology.Spec.Levels))
for i, level := range topology.Spec.Levels {
result[i] = level.NodeLabel
}
return result
}