From 6f45058918d08bc8d87f734eba4251d18eb3b748 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20Wo=C5=BAniak?=
Date: Mon, 4 Nov 2024 16:03:23 +0100
Subject: [PATCH] Review remarks

Co-authored-by: David Grove
Co-authored-by: Yuki Iwai
---
 .../concepts/topology_aware_scheduling.md   | 86 ++++---------------
 .../examples/tas/sample-job-preferred.yaml  | 24 ++++++
 site/static/examples/tas/sample-queues.yaml | 34 ++++++++
 3 files changed, 74 insertions(+), 70 deletions(-)
 create mode 100644 site/static/examples/tas/sample-job-preferred.yaml
 create mode 100644 site/static/examples/tas/sample-queues.yaml

diff --git a/site/content/en/docs/concepts/topology_aware_scheduling.md b/site/content/en/docs/concepts/topology_aware_scheduling.md
index aba7ea5e80..b75d15c59f 100644
--- a/site/content/en/docs/concepts/topology_aware_scheduling.md
+++ b/site/content/en/docs/concepts/topology_aware_scheduling.md
@@ -9,9 +9,9 @@ description: >
 {{< feature-state state="alpha" for_version="v0.9" >}}
 
 It is common that AI/ML workloads require a significant amount of pod-to-pod
-communication, and thus the network bendwidth between the running Pods
+communication. Therefore, the network bandwidth between the running Pods
 translates into the workload execution time, and the cost of running
-such workloads. Then, the connectivity between the Pods depends on the placement
+such workloads. The available bandwidth between the Pods depends on the placement
 of the Nodes, running the Pods, in the data center.
 
 We observe that the data centers have a hierarchical structure of their
@@ -25,7 +25,7 @@ blocks are more distant than two nodes within the same block.
 In this feature (called Topology Aware Scheduling, or TAS for short) we
 introduce a convention to represent the
 [hierarchical node topology information](#node-topology-information), and a set
-of APIs for Kueue administrators and users to utilize the information in order
+of APIs for Kueue administrators and users to utilize the information
 to optimize the Pod placement.
 
 ### Node topology information
@@ -39,7 +39,7 @@ which identifies uniquely its location in the tree structure. We do not assume
 global uniqueness of labels on each level, i.e. there could be two nodes with
 the same "rack" label, but in different "blocks".
 
-For example, this is a representation of the dataset hierarchy;
+For example, this is a representation of the data center hierarchy:
 
 | node   | cloud.provider.com/topology-block | cloud.provider.com/topology-rack |
 |:------:|:----------------------------------:|:--------------------------------:|
@@ -51,6 +51,11 @@ For example, this is a representation of the dataset hierarchy;
 Note that, there is a pair of nodes, node-1 and node-3, with the same value of
 the "cloud.provider.com/topology-rack" label, but in different blocks.
 
+{{% alert title="Note" color="primary" %}}
+TAS only includes Nodes with the `Ready=True` condition when aggregating the Node
+capacity for scheduling in each topology domain.
+{{% /alert %}}
+
 ### Admin-facing APIs
 
 As an admin, in order to enable the feature you need to:
@@ -61,48 +66,13 @@ As an admin, in order to enable the feature you need to:
 
 #### Example
 
-```yaml
-apiVersion: kueue.x-k8s.io/v1alpha1
-kind: Topology
-metadata:
-  name: "default"
-spec:
-  levels:
-  - nodeLabel: "cloud.provider.com/topology-block"
-  - nodeLabel: "cloud.provider.com/topology-rack"
-  - nodeLabel: "kubernetes.io/hostname"
----
-kind: ResourceFlavor
-apiVersion: kueue.x-k8s.io/v1beta1
-metadata:
-  name: "tas-flavor"
-spec:
-  nodeLabels:
-    cloud.provider.com/: "tas-node-group"
-  topologyName: "default"
----
-apiVersion: kueue.x-k8s.io/v1beta1
-kind: ClusterQueue
-metadata:
-  name: "tas-cluster-queue"
-spec:
-  namespaceSelector: {} # match all.
-  resourceGroups:
-  - coveredResources: ["cpu", "memory"]
-    flavors:
-    - name: "tas-flavor"
-      resources:
-      - name: "cpu"
-        nominalQuota: 100
-      - name: "memory"
-        nominalQuota: 100Gi
-```
+{{< include "examples/tas/sample-queues.yaml" "yaml" >}}
 
 ### User-facing APIs
 
 Once TAS is configured and ready to be used, you can create Jobs with the
 following annotations set at the PodTemplate level:
-- `kueue.x-k8s.io/podset-required-topology` - indicates that a PodSet requires
+- `kueue.x-k8s.io/podset-preferred-topology` - indicates that a PodSet requires
 Topology Aware Scheduling, but scheduling all pods within pods on nodes within
 the same topology domain is a preference rather than requirement. The levels
 are evaluated one-by-one going up from the level indicated by
@@ -121,32 +91,7 @@ Here is an example Job a user might submit to use TAS. It assumes there exists
 a LocalQueue named `tas-user-queue` which refernces the ClusterQueue pointing
 to a TAS ResourceFlavor.
 
-```yaml
-apiVersion: batch/v1
-kind: Job
-metadata:
-  generateName: tas-sample-big-preferred-host
-  labels:
-    kueue.x-k8s.io/queue-name: tas-user-queue
-spec:
-  parallelism: 40
-  completions: 40
-  completionMode: Indexed
-  template:
-    metadata:
-      annotations:
-        kueue.x-k8s.io/podset-preferred-topology: "cloud.provider.com/topology-block"
-    spec:
-      containers:
-      - name: dummy-job
-        image: gcr.io/k8s-staging-perf-tests/sleep:v0.1.0
-        args: ["300s"]
-        resources:
-          requests:
-            cpu: "1"
-            memory: "200Mi"
-      restartPolicy: Never
-```
+{{< include "examples/tas/sample-job-preferred.yaml" "yaml" >}}
 
 ### Limitations
 
@@ -154,9 +99,10 @@ Currently, there are multiple limitations for the compatibility of the feature
 with other features. In particular, a ClusterQueue referencing a TAS Resource
 Flavor (with the `.spec.topologyName` field) is marked as inactive in the
 following scenarios:
-- the CQ is in cohort
-- the CQ is using preemption
-- the CQ is using MultiKueue or ProvisioningRequest admission checks
+- the CQ is in a cohort (`.spec.cohort` is set)
+- the CQ is using [preemption](preemption.md)
+- the CQ is using [MultiKueue](multikueue.md) or
+  [ProvisioningRequest](/docs/admission-check-controllers/provisioning/) admission checks
 
 These usage scenarios are considered to be supported in the future releases
 of Kueue.
diff --git a/site/static/examples/tas/sample-job-preferred.yaml b/site/static/examples/tas/sample-job-preferred.yaml
new file mode 100644
index 0000000000..3a0674bfce
--- /dev/null
+++ b/site/static/examples/tas/sample-job-preferred.yaml
@@ -0,0 +1,24 @@
+apiVersion: batch/v1
+kind: Job
+metadata:
+  generateName: tas-sample-preferred
+  labels:
+    kueue.x-k8s.io/queue-name: tas-user-queue
+spec:
+  parallelism: 40
+  completions: 40
+  completionMode: Indexed
+  template:
+    metadata:
+      annotations:
+        kueue.x-k8s.io/podset-preferred-topology: "cloud.provider.com/topology-block"
+    spec:
+      containers:
+      - name: dummy-job
+        image: gcr.io/k8s-staging-perf-tests/sleep:v0.1.0
+        args: ["300s"]
+        resources:
+          requests:
+            cpu: "1"
+            memory: "200Mi"
+      restartPolicy: Never
\ No newline at end of file
diff --git a/site/static/examples/tas/sample-queues.yaml b/site/static/examples/tas/sample-queues.yaml
new file mode 100644
index 0000000000..56c1ae68db
--- /dev/null
+++ b/site/static/examples/tas/sample-queues.yaml
@@ -0,0 +1,34 @@
+apiVersion: kueue.x-k8s.io/v1alpha1
+kind: Topology
+metadata:
+  name: "default"
+spec:
+  levels:
+  - nodeLabel: "cloud.provider.com/topology-block"
+  - nodeLabel: "cloud.provider.com/topology-rack"
+  - nodeLabel: "kubernetes.io/hostname"
+---
+kind: ResourceFlavor
+apiVersion: kueue.x-k8s.io/v1beta1
+metadata:
+  name: "tas-flavor"
+spec:
+  nodeLabels:
+    cloud.provider.com/node-group: "tas-node-group" # assumed full label key; substitute the node-group label used in your cluster
+  topologyName: "default"
+---
+apiVersion: kueue.x-k8s.io/v1beta1
+kind: ClusterQueue
+metadata:
+  name: "tas-cluster-queue"
+spec:
+  namespaceSelector: {} # match all.
+  resourceGroups:
+  - coveredResources: ["cpu", "memory"]
+    flavors:
+    - name: "tas-flavor"
+      resources:
+      - name: "cpu"
+        nominalQuota: 100
+      - name: "memory"
+        nominalQuota: 100Gi
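The sample Job above is labeled with `kueue.x-k8s.io/queue-name: tas-user-queue`, but neither sample manifest defines that LocalQueue. A minimal sketch of the missing piece could look like the following; the `default` namespace is an assumption, and the queue should live in whichever namespace the Jobs are submitted to.

```yaml
# Hypothetical companion to sample-queues.yaml (not part of the patch above):
# a LocalQueue named "tas-user-queue" that the sample Job's queue-name label
# expects to exist, pointing at the TAS ClusterQueue defined above.
apiVersion: kueue.x-k8s.io/v1beta1
kind: LocalQueue
metadata:
  namespace: "default"   # assumption; use the namespace where Jobs are created
  name: "tas-user-queue"
spec:
  clusterQueue: "tas-cluster-queue"
```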