TAS: KEP update for API #3237
@@ -334,7 +334,7 @@ the "cloud.provider.com/topology-rack" label, but in different blocks.
type ResourceFlavorSpec struct {
...
// TopologyName indicates the name of the topology for the ResourceFlavor.
// topologyName indicates topology for the TAS ResourceFlavor.
// When specified, it enables scraping of the topology information from the
// nodes matching to the Resource Flavor node labels.
//

@@ -344,11 +344,12 @@ type ResourceFlavorSpec struct {

// TopologySpec defines the desired state of Topology
type TopologySpec struct {
// Levels defines the levels of topology.
// levels define the levels of topology.
//
// +required
// +listType=atomic
// +kubebuilder:validation:MinItems=1
// +kubebuilder:validation:MaxItems=5
// +kubebuilder:validation:MaxItems=8
Levels []TopologyLevel `json:"levels,omitempty"`
}

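For illustration, the enlarged bound allows a topology of up to eight levels. Below is a minimal, hypothetical sketch of a Topology spec describing a two-level block/rack hierarchy; it assumes TopologyLevel exposes the node label key for its level, which is not shown in this hunk:

```golang
// Hypothetical sketch only: the real TopologyLevel type is defined elsewhere
// in the KEP; here we assume it carries the node label key for its level.
type TopologyLevel struct {
	NodeLabel string `json:"nodeLabel"`
}

// A two-level topology, ordered from the highest to the lowest level.
var exampleTopology = TopologySpec{
	Levels: []TopologyLevel{
		{NodeLabel: "cloud.provider.com/topology-block"},
		{NodeLabel: "cloud.provider.com/topology-rack"},
	},
}
```
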
@@ -397,18 +398,19 @@ PodTemplate level:

```golang
const (

// This annotation indicates that a PodSet requires Topology Aware Scheduling,
// and running all pods on nodes closely connected within the same level of
// hierarchy is a strong requirement for scheduling the workload.
// PodSetRequiredTopologyAnnotation indicates that a PodSet requires
// Topology Aware Scheduling, and requires scheduling all pods on nodes
// within the same topology domain corresponding to the topology level
// indicated by the annotation value (e.g. within a rack or within a block).
PodSetRequiredTopologyAnnotation = "kueue.x-k8s.io/podset-required-topology"

// This annotation indicates that a PodSet requires Topology Aware Scheduling,
// but running all pods without the same topology level is a preference rather
// than requirement.
// PodSetPreferredTopologyAnnotation indicates that a PodSet requires
// Topology Aware Scheduling, but scheduling all pods within pods on nodes
// within the same topology domain is a preference rather than requirement.
//
// The levels are evaluated one-by-one going up from the level indicated by
// the annotation. If the PodSet cannot fit within a given topology domain
// then the next topology level up is checked. If the PodSet cannot fit
// then the next topology level up is considered. If the PodSet cannot fit
// at the highest topology level, then it gets admitted as distributed
// among multiple topology domains.
PodSetPreferredTopologyAnnotation = "kueue.x-k8s.io/podset-preferred-topology"

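For illustration, here is a hedged sketch of how a Job would carry the annotation on its PodTemplate; the annotation key comes from the constants above, while the rack-level value reuses the example node label from earlier in the KEP:

```golang
import (
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// Sketch: a PodTemplate requesting that all pods of the PodSet land within
// a single rack (the "required" variant of the annotation).
var template = corev1.PodTemplateSpec{
	ObjectMeta: metav1.ObjectMeta{
		Annotations: map[string]string{
			"kueue.x-k8s.io/podset-required-topology": "cloud.provider.com/topology-rack",
		},
	},
}
```
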
@@ -439,20 +441,26 @@ Job level.

```golang
type PodSet struct {
...
// TopologyRequest defines the topology requested for the corresponding PodSet.
// topologyRequest defines the topology request for the PodSet.
//
// +optional
TopologyRequest *PodSetTopologyRequest `json:"topologyRequest,omitempty"`
}

type PodSetTopologyRequest struct {
// Policy defines the policy used for TAS. Possible values are:
// - Preferred set when `kueue.x-k8s.io/podset-preferred-topology` annotation is set on the Job
// - Required set when `kueue.x-k8s.io/podset-required-topology` annotation is set on the Job
Policy TopologyRequestPolicy `json:"policy"`

// Level indicated by the `kueue.x-k8s.io/podset-preferred-topology` or
// `kueue.x-k8s.io/podset-required-topology` annotation
Level string `json:"level"`
// required indicates the topology level required by the PodSet, as
// indicated by the `kueue.x-k8s.io/podset-required-topology` PodSet
// annotation.
//
// +optional
Required *string `json:"required,omitempty"`

// preferred indicates the topology level preferred by the PodSet, as
// indicated by the `kueue.x-k8s.io/podset-preferred-topology` PodSet
// annotation.
//
// +optional
Preferred *string `json:"preferred,omitempty"`
}
```

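The relationship between the annotations and the new fields can be sketched as below; buildTopologyRequest is a hypothetical helper for illustration, not part of the KEP:

```golang
// buildTopologyRequest translates the PodTemplate annotations into the new
// PodSetTopologyRequest fields: the annotation value (a topology level, i.e.
// a node label key) lands in either the required or the preferred field.
func buildTopologyRequest(annotations map[string]string) *PodSetTopologyRequest {
	if level, ok := annotations[PodSetRequiredTopologyAnnotation]; ok {
		return &PodSetTopologyRequest{Required: &level}
	}
	if level, ok := annotations[PodSetPreferredTopologyAnnotation]; ok {
		return &PodSetTopologyRequest{Preferred: &level}
	}
	// No TAS annotation: the PodSet carries no topology request.
	return nil
}
```
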
@@ -463,28 +471,72 @@ at each topology level to the specific subset of nodes.
type PodSetAssignment struct {
...

// TopologyAssignment indicates the resources assigned per topology level
// topologyAssignment indicates the topology assignment divided into
// topology domains corresponding to the lowest level of the topology.
// The assignment specifies the number of Pods to be scheduled per topology
// domain and specifies the node selectors for each topology domain, in the
// following way: the node selector keys are specified by the levels field
// (same for all domains), and the corresponding node selector value is
// specified by the domains.values subfield.
//
// Example:
//
// topologyAssignment:
//   levels:
//   - cloud.provider.com/topology-block
//   - cloud.provider.com/topology-rack
//   domains:
//   - values: [block-1, rack-1]
//     count: 4
//   - values: [block-1, rack-2]
//     count: 2
//
// Here:
// - 4 Pods are to be scheduled on nodes matching the node selector:
//   cloud.provider.com/topology-block: block-1
//   cloud.provider.com/topology-rack: rack-1
// - 2 Pods are to be scheduled on nodes matching the node selector:
//   cloud.provider.com/topology-block: block-1
//   cloud.provider.com/topology-rack: rack-2
//
// +optional
TopologyAssignment *TopologyAssignment `json:"topologyAssignment,omitempty"`
}

type TopologyAssignment struct {
// Groups contains the list of assignments split into groups corresponding
// to the same topology domain at the lowest level of the hierarchy.
// levels is an ordered list of keys denoting the levels of the assigned
// topology (i.e. node label keys), from the highest to the lowest level of
// the topology.
//
// +required
// +listType=atomic
// +kubebuilder:validation:MinItems=1
Groups []TopologyAssignmentGroup `json:"groups"`
// +kubebuilder:validation:MaxItems=8
Levels []string `json:"levels"`
Which objective does this field serve: "storing desired state" or "a temporary cache for the computation by the TopologyUngater"? In that case, could we put this information in the internal cache instead of the CRD?

It is desired state to be reached by the Pods. We cannot use internal state, because then it would be lost on Kueue restart.

Does that mean that the TopologyUngater cannot obtain the levels from the Topology CR when Kueue restarts?

Oh, I see what you mean. A similar question was asked in this comment. It makes the implementation of the TopologyUngater simpler (otherwise it would require lookups via the ResourceFlavor API to the Topology API, and dealing with changes to those objects), at the very small cost in terms of object size, as the number of levels is quite limited. Also, I think it is conceptually consistent with what we do here, as we store the mapping between resources and flavors rather than reading resources from the CQ API object.

I think that this could be resolved by a dedicated internal cache to store the relationships between the RF and the Topology.

Yes, I agree with you. My doubt was that the field is just a cache, since we can obtain it from other objects.

Uhm, I see. Indeed the levels (new field) and flavors (existing field) are stored in the Workload status, and it seems that reduces the multiple API calls (RF -> Topology). When we graduate the API version to v1beta2, we may want to restructure the Workload status so that we can store the actual assigned flavor and topology information in the same field. Anyway, I'm ok with adding the levels here. Thanks.

Sure, it looks possible, I get the point, but I think it would complicate the implementation to maintain the cache, rather than reading from the object.

Sure, I'm ok with adding it to the KEP to re-evaluate this decision.

Thanks!

Let me know if you want that to be added, though I think it is unlikely to be dropped. For example, if we read from the cache, the keys could change in the meantime, and then the domain assignment would be corrupted. If we know the keys at the moment of assignment, we could compare them with the ones in the cache (maintained based on the Topology API) and evict the workload.

Yeah, that's true. I think that this is a trade-off between implementation costs and API maintenance costs, since a dedicated cache could avoid exposing the field in the API and would make it easier to change the structure, but it has implementation costs. Anyway, I do not claim a dedicated cache now.

This could be prevented by event-based cache updating, the same as today's cache package (/pkg/cache).

I can update the alternatives section in a follow-up.

Yeah, sure.

// domains is a list of topology assignments split by topology domains at
// the lowest level of the topology.
//
// +required
Domains []TopologyDomainAssignment `json:"domains"`
}

type TopologyAssignmentGroup struct {
// NodeLabels constitutes the nodeSelector for a given slice of pods. It
// defines values for all labels configured in the Topology.Levels.
type TopologyDomainAssignment struct {
// values is an ordered list of node selector values describing a topology
// domain. The values correspond to the consecutive topology levels, from
// the highest to the lowest.
//
// +required
// +listType=atomic
// +kubebuilder:validation:MinItems=1
NodeLabels map[string]string `json:"nodeLabels"`
// +kubebuilder:validation:MaxItems=8
Values []string `json:"values"`

It would be clearer if you duplicated this description in TopologyAssignment.

Sure, I've updated the TopologyAssignment description to include the information on how the node selector is specified. PTAL.

I think the … Will we receive requests other than nodeSelector keys here?

I agree, this is why I mention the node selector in the comment. We could name the field … The node selectors are created by combining only the keys (the .levels field) and the values (the .domains.values field).

Sorry, I meant the nodeSelector values. In that case, how about … ?

As a name it SGTM, but maybe the only potential downside is gRPC size if we have many domains assigned? I guess for big workloads it could be 10k domains or more (if the lowest level is an individual node). WDYT?

I guess that … Additionally, I guess that TAS will be used for Top of Rack (ToR), Leaf, or Spine switches rather than Nodes, since the topology often needs to be considered based on the network topology, right? Let me know if you have actual or imaginable situations where we need to construct the topology based on Nodes.

Later maybe yes; for now we just want to ensure the pods land on closely connected nodes. I think there could be use cases to assign at the level of nodes, to ensure there is no issue with fragmentation of quota. For example, if you assign at the level of a rack, it could happen that a Pod can fit in the rack, but cannot fit on an individual node. I guess it will need to depend on the user preference.

Yes, it is not much, but the overall API object size limit in etcd is 1.5Mi, which is not much either, so the gain is around 3% of the max size.

I feel that this usage goes beyond the initial TAS motivations. But I know that we received similar requests here: #3211

Yes, that's true. Here, the key problem is which disadvantages we should accept. I think that both options have the following disadvantages: … Based on the fact that the Workload object is for Kueue developers and not exposed to batch users, I am leaning toward accepting …

// Count indicates the number of pods in a given TopologyAssignmentGroup.
// count indicates the number of Pods to be scheduled in the topology
// domain indicated by the values field.
//
// +required
// +kubebuilder:validation:Minimum=1
Count int32 `json:"count"`
}
```

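To make the levels/values pairing concrete, here is a hedged sketch of how an ungater-like component could derive the node selector for a single topology domain; the helper name is hypothetical and not part of the KEP:

```golang
// nodeSelectorFor pairs the shared level keys of the assignment with one
// domain's values to produce the node selector for that topology domain.
// Levels and Values are index-aligned, from the highest to the lowest level.
func nodeSelectorFor(assignment TopologyAssignment, domain TopologyDomainAssignment) map[string]string {
	selector := make(map[string]string, len(assignment.Levels))
	for i, key := range assignment.Levels {
		selector[key] = domain.Values[i]
	}
	return selector
}
```

For the example in the topologyAssignment comment above, the first domain would yield the selector {cloud.provider.com/topology-block: block-1, cloud.provider.com/topology-rack: rack-1}, to be applied to 4 Pods.
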
@@ -495,15 +547,20 @@ different values:

```golang
const (
// TopologySchedulingGate is used to delay topology assignment for pods
// once all the pods are created.
// TopologySchedulingGate is used to delay scheduling of a Pod until the
// nodeSelectors corresponding to the assigned topology domain are injected
// into the Pod.
TopologySchedulingGate = "kueue.x-k8s.io/topology"

// WorkloadAnnotation indicates the name of the workload assigned.
// WorkloadAnnotation is an annotation set on the Job's PodTemplate to
// indicate the name of the admitted Workload corresponding to the Job. The
// annotation is set when starting the Job, and removed on stopping the Job.
WorkloadAnnotation = "kueue.x-k8s.io/workload"

// PodSetLabel indicates the name of the PodSet in the workload
PodSeLabel = "kueue.x-k8s.io/podset"
// PodSetLabel is a label set on the Job's PodTemplate to indicate the name
// of the PodSet of the admitted Workload corresponding to the PodTemplate.
// The label is set when starting the Job, and removed on stopping the Job.
PodSetLabel = "kueue.x-k8s.io/podset"
)
```

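As a hedged sketch (not the KEP's implementation), the scheduling gate and the computed node selector could come together at ungating time roughly as follows:

```golang
import corev1 "k8s.io/api/core/v1"

// ungatePod shows, in simplified form, how the node selector computed from the
// topology assignment could be injected into a gated Pod and the TAS scheduling
// gate removed so that kube-scheduler can proceed.
func ungatePod(pod *corev1.Pod, nodeSelector map[string]string) {
	if pod.Spec.NodeSelector == nil {
		pod.Spec.NodeSelector = map[string]string{}
	}
	for k, v := range nodeSelector {
		pod.Spec.NodeSelector[k] = v
	}
	// Drop the "kueue.x-k8s.io/topology" gate, keeping any other gates intact.
	gates := pod.Spec.SchedulingGates[:0]
	for _, g := range pod.Spec.SchedulingGates {
		if g.Name != TopologySchedulingGate {
			gates = append(gates, g)
		}
	}
	pod.Spec.SchedulingGates = gates
}
```
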
Regarding the levels field:

What is the purpose of this field?

The TopologyUngater needs the information about the keys to construct the full node selector when ungating pods. This information is also present in the Topology object, but I think it is useful to also put it here. Otherwise we need to complicate the code of the Ungater to look up the Topology object via the ResourceFlavor, and deal with possible deletions / changes of the API objects in the meantime. The list has at most 5 elements (levels), so the Workload object size should not be an issue.

Ok, put this field above domains.

Done