TAS: skip flavor checks which don't have requested level #3399

Merged
5 changes: 5 additions & 0 deletions pkg/cache/tas_flavor_snapshot.go
@@ -209,6 +209,11 @@ func (s *TASFlavorSnapshot) FindTopologyAssignment(
return s.buildAssignment(currFitDomain)
}

func (s *TASFlavorSnapshot) HasLevel(r *kueue.PodSetTopologyRequest) bool {
_, found := s.resolveLevelIdx(r)
return found
}

func (s *TASFlavorSnapshot) resolveLevelIdx(
topologyRequest *kueue.PodSetTopologyRequest) (int, bool) {
var levelKey string
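The body of resolveLevelIdx is cut off above. As a rough mental model, it maps the level key named by the PodSet's topology request onto the flavor's ordered topology levels, and the new HasLevel helper only reports whether that lookup succeeds. The sketch below is a minimal, self-contained illustration of that contract; the type and field names here (topologyRequest, flavorSnapshot, levelKeys) are simplified stand-ins, not Kueue's actual API.

package main

import "fmt"

// Simplified stand-ins for kueue.PodSetTopologyRequest and TASFlavorSnapshot;
// the real types carry more fields and live in the Kueue packages.
type topologyRequest struct {
	Required  *string // node-label key the PodSet requires, if any
	Preferred *string // node-label key the PodSet prefers, if any
}

type flavorSnapshot struct {
	levelKeys []string // ordered node-label keys of the flavor's topology levels
}

// resolveLevelIdx returns the position of the requested level key within the
// topology's ordered levels, and whether it exists at all.
func (s *flavorSnapshot) resolveLevelIdx(r *topologyRequest) (int, bool) {
	var key string
	switch {
	case r.Required != nil:
		key = *r.Required
	case r.Preferred != nil:
		key = *r.Preferred
	default:
		return 0, false
	}
	for i, k := range s.levelKeys {
		if k == key {
			return i, true
		}
	}
	return 0, false
}

// HasLevel mirrors the helper added in this PR: callers only need to know
// whether the requested level exists, not where it sits in the hierarchy.
func (s *flavorSnapshot) HasLevel(r *topologyRequest) bool {
	_, found := s.resolveLevelIdx(r)
	return found
}

func main() {
	s := &flavorSnapshot{levelKeys: []string{"cloud.com/block", "kubernetes.io/hostname"}}
	missing := "cloud.com/non-existing"
	fmt.Println(s.HasLevel(&topologyRequest{Required: &missing})) // false
}

With such a helper in place, the flavor assigner (the next file in this PR) can reject a flavor whose topology does not define the requested level instead of attempting a topology assignment that can never succeed.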
2 changes: 1 addition & 1 deletion pkg/scheduler/flavorassigner/flavorassigner.go
@@ -443,7 +443,7 @@ func (a *FlavorAssigner) findFlavorForPodSetResource(
continue
}
if features.Enabled(features.TopologyAwareScheduling) {
- if message := checkPodSetAndFlavorMatchForTAS(ps, flavor); message != nil {
+ if message := checkPodSetAndFlavorMatchForTAS(a.cq, ps, flavor); message != nil {
log.Error(nil, *message)
status.append(*message)
continue
19 changes: 16 additions & 3 deletions pkg/scheduler/flavorassigner/tas_flavorassigner.go
@@ -92,10 +92,23 @@ func onlyFlavor(ra ResourceAssignment) (*kueue.ResourceFlavorReference, error) {
return nil, errors.New("no flavor assigned")
}

- func checkPodSetAndFlavorMatchForTAS(ps *kueue.PodSet, flavor *kueue.ResourceFlavor) *string {
+ func checkPodSetAndFlavorMatchForTAS(cq *cache.ClusterQueueSnapshot, ps *kueue.PodSet, flavor *kueue.ResourceFlavor) *string {
// For PodSets which require TAS skip resource flavors which don't support it
- if ps.TopologyRequest != nil && flavor.Spec.TopologyName == nil {
- return ptr.To(fmt.Sprintf("Flavor %q does not support TopologyAwareScheduling", flavor.Name))
+ if ps.TopologyRequest != nil {
+ if flavor.Spec.TopologyName == nil {
+ return ptr.To(fmt.Sprintf("Flavor %q does not support TopologyAwareScheduling", flavor.Name))
+ }
+ s := cq.TASFlavors[kueue.ResourceFlavorReference(flavor.Name)]
+ if s == nil {
+ // Skip Flavors if they don't have TAS information. This should generally
+ // not happen, but possible in race-situation when the ResourceFlavor
+ // API object was recently added but is not cached yet.
+ return ptr.To(fmt.Sprintf("Flavor %q information missing in TAS cache", flavor.Name))
+ }
+ if !s.HasLevel(ps.TopologyRequest) {
+ // Skip flavors which don't have the requested level
+ return ptr.To(fmt.Sprintf("Flavor %q does not contain the requested level", flavor.Name))
+ }
}
// For PodSets which don't use TAS skip resource flavors which are only for TAS
if ps.TopologyRequest == nil && flavor.Spec.TopologyName != nil {
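Taken together, the patched check skips a flavor for a TAS-requesting PodSet for any of three reasons: the flavor has no topology at all, the flavor is missing from the TAS cache, or its topology does not define the requested level. Conversely, a PodSet without a topology request still skips TAS-only flavors. The sketch below restates that decision order with simplified, self-contained types; the struct fields, helper names, and the message in the final branch (the diff is truncated before the original return there) are illustrative stand-ins rather than Kueue's exact code.

package main

import "fmt"

// Illustrative, trimmed-down stand-ins for the Kueue types used above.
type resourceFlavor struct {
	Name         string
	TopologyName *string // nil: the flavor is not TAS-enabled
}

type podSet struct {
	TopologyRequest *string // requested level key; nil: no TAS request
}

type tasSnapshot struct {
	levels map[string]bool // level keys present in the flavor's topology
}

func (s *tasSnapshot) HasLevel(level string) bool { return s.levels[level] }

// checkPodSetAndFlavorMatch mirrors the decision order of the patched
// checkPodSetAndFlavorMatchForTAS: a non-nil message means "skip this flavor".
func checkPodSetAndFlavorMatch(tasFlavors map[string]*tasSnapshot, ps podSet, f resourceFlavor) *string {
	if ps.TopologyRequest != nil {
		if f.TopologyName == nil {
			return msg("Flavor %q does not support TopologyAwareScheduling", f.Name)
		}
		s := tasFlavors[f.Name]
		if s == nil {
			// The flavor is not reflected in the TAS cache yet (e.g. just created).
			return msg("Flavor %q information missing in TAS cache", f.Name)
		}
		if !s.HasLevel(*ps.TopologyRequest) {
			return msg("Flavor %q does not contain the requested level", f.Name)
		}
	}
	if ps.TopologyRequest == nil && f.TopologyName != nil {
		// Illustrative message; the original return sits outside the shown hunk.
		return msg("Flavor %q is reserved for TopologyAwareScheduling", f.Name)
	}
	return nil // the flavor is compatible with this PodSet
}

func msg(format string, args ...any) *string {
	m := fmt.Sprintf(format, args...)
	return &m
}

func main() {
	level := "cloud.com/custom-level"
	ps := podSet{TopologyRequest: &level}
	topo := "tas-single-level-topology" // illustrative topology name
	defaultFlavor := resourceFlavor{Name: "tas-default", TopologyName: &topo}
	// Only kubernetes.io/hostname is known for tas-default here, so the
	// requested custom level is missing and the flavor gets skipped.
	snapshots := map[string]*tasSnapshot{
		"tas-default": {levels: map[string]bool{"kubernetes.io/hostname": true}},
	}
	if m := checkPodSetAndFlavorMatch(snapshots, ps, defaultFlavor); m != nil {
		fmt.Println(*m)
	}
}

Running this prints the "does not contain the requested level" message for tas-default, loosely mirroring the new scheduler test below where only the second, custom flavor defines cloud.com/custom-level.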
118 changes: 118 additions & 0 deletions pkg/scheduler/scheduler_test.go
@@ -4162,6 +4162,124 @@ func TestScheduleForTAS(t *testing.T) {
},
},
},
"workload requests topology level which is not present in topology": {
nodes: defaultSingleNode,
topologies: []kueuealpha.Topology{defaultSingleLevelTopology},
resourceFlavors: []kueue.ResourceFlavor{defaultTASFlavor},
clusterQueues: []kueue.ClusterQueue{defaultClusterQueue},
workloads: []kueue.Workload{
*utiltesting.MakeWorkload("foo", "default").
Queue("tas-main").
PodSets(*utiltesting.MakePodSet("one", 1).
RequiredTopologyRequest("cloud.com/non-existing").
Request(corev1.ResourceCPU, "1").
Obj()).
Obj(),
},
wantInadmissibleLeft: map[string][]string{
"tas-main": {"default/foo"},
},
wantEvents: []utiltesting.EventRecord{
{
Key: types.NamespacedName{Namespace: "default", Name: "foo"},
EventType: "Warning",
Reason: "Pending",
Message: "failed to assign flavors to pod set one: no flavor assigned",
},
},
},
"workload requests topology level which is only present in second flavor": {
nodes: []corev1.Node{
{
ObjectMeta: metav1.ObjectMeta{
Name: "x1",
Labels: map[string]string{
"tas-node": "true",
"cloud.com/custom-level": "x1",
},
},
Status: corev1.NodeStatus{
Allocatable: corev1.ResourceList{
corev1.ResourceCPU: resource.MustParse("1"),
},
},
},
},
topologies: []kueuealpha.Topology{defaultSingleLevelTopology,
{
ObjectMeta: metav1.ObjectMeta{
Name: "tas-custom-topology",
},
Spec: kueuealpha.TopologySpec{
Levels: []kueuealpha.TopologyLevel{
{
NodeLabel: "cloud.com/custom-level",
},
},
},
},
},
resourceFlavors: []kueue.ResourceFlavor{defaultTASFlavor,
{
ObjectMeta: metav1.ObjectMeta{
Name: "tas-custom-flavor",
},
Spec: kueue.ResourceFlavorSpec{
NodeLabels: map[string]string{
"tas-node": "true",
},
TopologyName: ptr.To("tas-custom-topology"),
},
},
},
clusterQueues: []kueue.ClusterQueue{
*utiltesting.MakeClusterQueue("tas-main").
ResourceGroup(
*utiltesting.MakeFlavorQuotas("tas-default").
Resource(corev1.ResourceCPU, "50").Obj(),
*utiltesting.MakeFlavorQuotas("tas-custom-flavor").
Resource(corev1.ResourceCPU, "50").Obj()).
Obj(),
},
workloads: []kueue.Workload{
*utiltesting.MakeWorkload("foo", "default").
Queue("tas-main").
PodSets(*utiltesting.MakePodSet("one", 1).
RequiredTopologyRequest("cloud.com/custom-level").
Request(corev1.ResourceCPU, "1").
Obj()).
Obj(),
},
wantNewAssignments: map[string]kueue.Admission{
"default/foo": *utiltesting.MakeAdmission("tas-main", "one").
Assignment(corev1.ResourceCPU, "tas-custom-flavor", "1").
AssignmentPodCount(1).
TopologyAssignment(&kueue.TopologyAssignment{
Levels: []string{"cloud.com/custom-level"},
Domains: []kueue.TopologyDomainAssignment{
{
Count: 1,
Values: []string{
"x1",
},
},
},
}).Obj(),
},
eventCmpOpts: []cmp.Option{eventIgnoreMessage},
wantEvents: []utiltesting.EventRecord{
{
Key: types.NamespacedName{Namespace: "default", Name: "foo"},
Reason: "QuotaReserved",
EventType: corev1.EventTypeNormal,
},
{
Key: types.NamespacedName{Namespace: "default", Name: "foo"},
Reason: "Admitted",
EventType: corev1.EventTypeNormal,
},
},
},
"workload does not get scheduled as it does not fit within the node capacity": {
nodes: defaultSingleNode,
topologies: []kueuealpha.Topology{defaultSingleLevelTopology},