Skip to content

Commit

Permalink
TAS: skip flavor checks which don't have requested level (kubernetes-…
Browse files Browse the repository at this point in the history
  • Loading branch information
mimowo authored and kannon92 committed Nov 19, 2024
1 parent 85c92fd commit 26cc32d
Show file tree
Hide file tree
Showing 4 changed files with 140 additions and 4 deletions.
5 changes: 5 additions & 0 deletions pkg/cache/tas_flavor_snapshot.go
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,11 @@ func (s *TASFlavorSnapshot) FindTopologyAssignment(
return s.buildAssignment(currFitDomain)
}

func (s *TASFlavorSnapshot) HasLevel(r *kueue.PodSetTopologyRequest) bool {
_, found := s.resolveLevelIdx(r)
return found
}

func (s *TASFlavorSnapshot) resolveLevelIdx(
topologyRequest *kueue.PodSetTopologyRequest) (int, bool) {
var levelKey string
Expand Down
2 changes: 1 addition & 1 deletion pkg/scheduler/flavorassigner/flavorassigner.go
Original file line number Diff line number Diff line change
Expand Up @@ -443,7 +443,7 @@ func (a *FlavorAssigner) findFlavorForPodSetResource(
continue
}
if features.Enabled(features.TopologyAwareScheduling) {
if message := checkPodSetAndFlavorMatchForTAS(ps, flavor); message != nil {
if message := checkPodSetAndFlavorMatchForTAS(a.cq, ps, flavor); message != nil {
log.Error(nil, *message)
status.append(*message)
continue
Expand Down
19 changes: 16 additions & 3 deletions pkg/scheduler/flavorassigner/tas_flavorassigner.go
Original file line number Diff line number Diff line change
Expand Up @@ -92,10 +92,23 @@ func onlyFlavor(ra ResourceAssignment) (*kueue.ResourceFlavorReference, error) {
return nil, errors.New("no flavor assigned")
}

func checkPodSetAndFlavorMatchForTAS(ps *kueue.PodSet, flavor *kueue.ResourceFlavor) *string {
func checkPodSetAndFlavorMatchForTAS(cq *cache.ClusterQueueSnapshot, ps *kueue.PodSet, flavor *kueue.ResourceFlavor) *string {
// For PodSets which require TAS skip resource flavors which don't support it
if ps.TopologyRequest != nil && flavor.Spec.TopologyName == nil {
return ptr.To(fmt.Sprintf("Flavor %q does not support TopologyAwareScheduling", flavor.Name))
if ps.TopologyRequest != nil {
if flavor.Spec.TopologyName == nil {
return ptr.To(fmt.Sprintf("Flavor %q does not support TopologyAwareScheduling", flavor.Name))
}
s := cq.TASFlavors[kueue.ResourceFlavorReference(flavor.Name)]
if s == nil {
// Skip Flavors if they don't have TAS information. This should generally
// not happen, but possible in race-situation when the ResourceFlavor
// API object was recently added but is not cached yet.
return ptr.To(fmt.Sprintf("Flavor %q information missing in TAS cache", flavor.Name))
}
if !s.HasLevel(ps.TopologyRequest) {
// Skip flavors which don't have the requested level
return ptr.To(fmt.Sprintf("Flavor %q does not contain the requested level", flavor.Name))
}
}
// For PodSets which don't use TAS skip resource flavors which are only for TAS
if ps.TopologyRequest == nil && flavor.Spec.TopologyName != nil {
Expand Down
118 changes: 118 additions & 0 deletions pkg/scheduler/scheduler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -4162,6 +4162,124 @@ func TestScheduleForTAS(t *testing.T) {
},
},
},
"workload requests topology level which is not present in topology": {
nodes: defaultSingleNode,
topologies: []kueuealpha.Topology{defaultSingleLevelTopology},
resourceFlavors: []kueue.ResourceFlavor{defaultTASFlavor},
clusterQueues: []kueue.ClusterQueue{defaultClusterQueue},
workloads: []kueue.Workload{
*utiltesting.MakeWorkload("foo", "default").
Queue("tas-main").
PodSets(*utiltesting.MakePodSet("one", 1).
RequiredTopologyRequest("cloud.com/non-existing").
Request(corev1.ResourceCPU, "1").
Obj()).
Obj(),
},
wantInadmissibleLeft: map[string][]string{
"tas-main": {"default/foo"},
},
wantEvents: []utiltesting.EventRecord{
{
Key: types.NamespacedName{Namespace: "default", Name: "foo"},
EventType: "Warning",
Reason: "Pending",
Message: "failed to assign flavors to pod set one: no flavor assigned",
},
},
},
"workload requests topology level which is only present in second flavor": {
nodes: []corev1.Node{
{
ObjectMeta: metav1.ObjectMeta{
Name: "x1",
Labels: map[string]string{
"tas-node": "true",
"cloud.com/custom-level": "x1",
},
},
Status: corev1.NodeStatus{
Allocatable: corev1.ResourceList{
corev1.ResourceCPU: resource.MustParse("1"),
},
},
},
},
topologies: []kueuealpha.Topology{defaultSingleLevelTopology,
{
ObjectMeta: metav1.ObjectMeta{
Name: "tas-custom-topology",
},
Spec: kueuealpha.TopologySpec{
Levels: []kueuealpha.TopologyLevel{
{
NodeLabel: "cloud.com/custom-level",
},
},
},
},
},
resourceFlavors: []kueue.ResourceFlavor{defaultTASFlavor,
{
ObjectMeta: metav1.ObjectMeta{
Name: "tas-custom-flavor",
},
Spec: kueue.ResourceFlavorSpec{
NodeLabels: map[string]string{
"tas-node": "true",
},
TopologyName: ptr.To("tas-custom-topology"),
},
},
},
clusterQueues: []kueue.ClusterQueue{
*utiltesting.MakeClusterQueue("tas-main").
ResourceGroup(
*utiltesting.MakeFlavorQuotas("tas-default").
Resource(corev1.ResourceCPU, "50").Obj(),
*utiltesting.MakeFlavorQuotas("tas-custom-flavor").
Resource(corev1.ResourceCPU, "50").Obj()).
Obj(),
},
workloads: []kueue.Workload{
*utiltesting.MakeWorkload("foo", "default").
Queue("tas-main").
PodSets(*utiltesting.MakePodSet("one", 1).
RequiredTopologyRequest("cloud.com/custom-level").
Request(corev1.ResourceCPU, "1").
Obj()).
Obj(),
},
wantNewAssignments: map[string]kueue.Admission{
"default/foo": *utiltesting.MakeAdmission("tas-main", "one").
Assignment(corev1.ResourceCPU, "tas-custom-flavor", "1").
AssignmentPodCount(1).
TopologyAssignment(&kueue.TopologyAssignment{
Levels: []string{"cloud.com/custom-level"},
Domains: []kueue.TopologyDomainAssignment{
{
Count: 1,
Values: []string{
"x1",
},
},
},
}).Obj(),
},
eventCmpOpts: []cmp.Option{eventIgnoreMessage},
wantEvents: []utiltesting.EventRecord{
{
Key: types.NamespacedName{Namespace: "default", Name: "foo"},
Reason: "QuotaReserved",
EventType: corev1.EventTypeNormal,
},
{
Key: types.NamespacedName{Namespace: "default", Name: "foo"},
Reason: "Admitted",
EventType: corev1.EventTypeNormal,
},
},
},
"workload does not get scheduled as it does not fit within the node capacity": {
nodes: defaultSingleNode,
topologies: []kueuealpha.Topology{defaultSingleLevelTopology},
Expand Down

0 comments on commit 26cc32d

Please sign in to comment.