Skip to content

Commit

Permalink
[Kjobctl] Extend priority flag to other modes (#3213)
Browse files Browse the repository at this point in the history
* extend priority flag to other modes

* Update after code review

* Unify priority flag for all modes

* Move priority flag check to common place
  • Loading branch information
mszadkow authored Oct 19, 2024
1 parent 8b0d7f3 commit 20c83fb
Show file tree
Hide file tree
Showing 14 changed files with 319 additions and 35 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,6 @@ type TemplateReference string
// +kubebuilder:validation:XValidation:rule="!has(self.requiredFlags) || !('ntasks' in self.requiredFlags) || self.name == 'Slurm'", message="ntasks flag can be used only on Slurm mode"
// +kubebuilder:validation:XValidation:rule="!has(self.requiredFlags) || !('output' in self.requiredFlags) || self.name == 'Slurm'", message="output flag can be used only on Slurm mode"
// +kubebuilder:validation:XValidation:rule="!has(self.requiredFlags) || !('partition' in self.requiredFlags) || self.name == 'Slurm'", message="partition flag can be used only on Slurm mode"
// +kubebuilder:validation:XValidation:rule="!has(self.requiredFlags) || !('priority' in self.requiredFlags) || self.name == 'Slurm'", message="priority flag can be used only on Slurm mode"
// +kubebuilder:validation:XValidation:rule="!has(self.requiredFlags) || self.name != 'Slurm' || !('parallelism' in self.requiredFlags)", message="parallelism flag can't be used on Slurm mode"
// +kubebuilder:validation:XValidation:rule="!has(self.requiredFlags) || self.name != 'Slurm' || !('completions' in self.requiredFlags)", message="completions flag can't be used on Slurm mode"
type SupportedMode struct {
Expand Down Expand Up @@ -121,9 +120,10 @@ type SupportedMode struct {
// The raycluster flag is used only for the RayJob mode.
// The request flag is used only for Interactive and Job modes.
// The cmd flag is used only for Interactive, Job, and RayJob modes.
// The skip-priority-validation and priority flags can be used in all modes.
// If the raycluster flag is set, none of localqueue, replicas, min-replicas, or max-replicas can be set.
// For the Slurm mode, the possible values are: array, cpus-per-task, error, gpus-per-task, input, job-name, mem, mem-per-cpu,
// mem-per-gpu, mem-per-task, nodes, ntasks, output, partition, localqueue, priority.
// mem-per-gpu, mem-per-task, nodes, ntasks, output, partition, localqueue.
//
// cmd and requests values are going to be added only to the first primary container.
//
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,9 +62,10 @@ spec:
The raycluster flag is used only for the RayJob mode.
The request flag is used only for Interactive and Job modes.
The cmd flag is used only for Interactive, Job, and RayJob modes.
The skip-priority-validation and priority flags can be used in all modes.
If the raycluster flag is set, none of localqueue, replicas, min-replicas, or max-replicas can be set.
For the Slurm mode, the possible values are: array, cpus-per-task, error, gpus-per-task, input, job-name, mem, mem-per-cpu,
mem-per-gpu, mem-per-task, nodes, ntasks, output, partition, localqueue, priority.
mem-per-gpu, mem-per-task, nodes, ntasks, output, partition, localqueue.
cmd and requests values are going to be added only to the first primary container.
items:
Expand Down Expand Up @@ -183,9 +184,6 @@ spec:
- message: partition flag can be used only on Slurm mode
rule: '!has(self.requiredFlags) || !(''partition'' in self.requiredFlags)
|| self.name == ''Slurm'''
- message: priority flag can be used only on Slurm mode
rule: '!has(self.requiredFlags) || !(''priority'' in self.requiredFlags)
|| self.name == ''Slurm'''
- message: parallelism flag can't be used on Slurm mode
rule: '!has(self.requiredFlags) || self.name != ''Slurm'' || !(''parallelism''
in self.requiredFlags)'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ The file is auto-generated from the Go source code of the component using the
Create an interactive shell

```
kjobctl create interactive --profile APPLICATION_PROFILE_NAME [--localqueue LOCAL_QUEUE_NAME] [--skip-localqueue-validation] [--cmd COMMAND] [--request RESOURCE_NAME=QUANTITY] [--pod-running-timeout DURATION] [--rm]
kjobctl create interactive --profile APPLICATION_PROFILE_NAME [--localqueue LOCAL_QUEUE_NAME] [--skip-localqueue-validation] [--priority NAME] [--skip-priority-validation] [--cmd COMMAND] [--request RESOURCE_NAME=QUANTITY] [--pod-running-timeout DURATION] [--rm]
```


Expand Down Expand Up @@ -99,6 +99,15 @@ kjobctl create interactive --profile APPLICATION_PROFILE_NAME [--localqueue LOCA
<p>The length of time (like 5s, 2m, or 3h, higher than zero) to wait until at least one pod is running.</p>
</td>
</tr>
<tr>
<td colspan="2">--priority string</td>
</tr>
<tr>
<td></td>
<td style="line-height: 130%; word-wrap: break-word;">
<p>Apply priority for the entire workload.</p>
</td>
</tr>
<tr>
<td colspan="2">-p, --profile string</td>
</tr>
Expand Down Expand Up @@ -144,6 +153,15 @@ kjobctl create interactive --profile APPLICATION_PROFILE_NAME [--localqueue LOCA
<p>Skip local queue validation. Add local queue even if the queue does not exist.</p>
</td>
</tr>
<tr>
<td colspan="2">--skip-priority-validation</td>
</tr>
<tr>
<td></td>
<td style="line-height: 130%; word-wrap: break-word;">
<p>Skip workload priority class validation. Add priority class label even if the class does not exist.</p>
</td>
</tr>
<tr>
<td colspan="2">--template string</td>
</tr>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ The file is auto-generated from the Go source code of the component using the
Create a job

```
kjobctl create job --profile APPLICATION_PROFILE_NAME [--localqueue LOCAL_QUEUE_NAME] [--skip-localqueue-validation] [--cmd COMMAND] [--request RESOURCE_NAME=QUANTITY] [--parallelism PARALLELISM] [--completions COMPLETIONS]
kjobctl create job --profile APPLICATION_PROFILE_NAME [--localqueue LOCAL_QUEUE_NAME] [--skip-localqueue-validation] [--priority NAME] [--skip-priority-validation] [--cmd COMMAND] [--request RESOURCE_NAME=QUANTITY] [--parallelism PARALLELISM] [--completions COMPLETIONS]
```


Expand Down Expand Up @@ -111,6 +111,15 @@ kjobctl create job --profile APPLICATION_PROFILE_NAME [--localqueue LOCAL_QUEUE_
<p>Parallelism specifies the maximum desired number of pods the job should run at any given time.</p>
</td>
</tr>
<tr>
<td colspan="2">--priority string</td>
</tr>
<tr>
<td></td>
<td style="line-height: 130%; word-wrap: break-word;">
<p>Apply priority for the entire workload.</p>
</td>
</tr>
<tr>
<td colspan="2">-p, --profile string</td>
</tr>
Expand Down Expand Up @@ -147,6 +156,15 @@ kjobctl create job --profile APPLICATION_PROFILE_NAME [--localqueue LOCAL_QUEUE_
<p>Skip local queue validation. Add local queue even if the queue does not exist.</p>
</td>
</tr>
<tr>
<td colspan="2">--skip-priority-validation</td>
</tr>
<tr>
<td></td>
<td style="line-height: 130%; word-wrap: break-word;">
<p>Skip workload priority class validation. Add priority class label even if the class does not exist.</p>
</td>
</tr>
<tr>
<td colspan="2">--template string</td>
</tr>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ Create a raycluster.
KubeRay operator is required for RayCluster. For instructions on installing the KubeRay operator, see https://ray-project.github.io/kuberay/deploy/installation/.

```
kjobctl create raycluster --profile APPLICATION_PROFILE_NAME [--localqueue LOCAL_QUEUE_NAME] [--skip-localqueue-validation] [--replicas [WORKER_GROUP]=REPLICAS] [--min-replicas [WORKER_GROUP]=MIN_REPLICAS] [--max-replicas [WORKER_GROUP]=MAX_REPLICAS]
kjobctl create raycluster --profile APPLICATION_PROFILE_NAME [--localqueue LOCAL_QUEUE_NAME] [--skip-localqueue-validation] [--priority NAME] [--skip-priority-validation] [--replicas [WORKER_GROUP]=REPLICAS] [--min-replicas [WORKER_GROUP]=MIN_REPLICAS] [--max-replicas [WORKER_GROUP]=MAX_REPLICAS]
```


Expand Down Expand Up @@ -103,6 +103,15 @@ kjobctl create raycluster --profile APPLICATION_PROFILE_NAME [--localqueue LOCAL
<p>Output format. One of: (json, yaml, name, go-template, go-template-file, template, templatefile, jsonpath, jsonpath-as-json, jsonpath-file).</p>
</td>
</tr>
<tr>
<td colspan="2">--priority string</td>
</tr>
<tr>
<td></td>
<td style="line-height: 130%; word-wrap: break-word;">
<p>Apply priority for the entire workload.</p>
</td>
</tr>
<tr>
<td colspan="2">-p, --profile string</td>
</tr>
Expand Down Expand Up @@ -139,6 +148,15 @@ kjobctl create raycluster --profile APPLICATION_PROFILE_NAME [--localqueue LOCAL
<p>Skip local queue validation. Add local queue even if the queue does not exist.</p>
</td>
</tr>
<tr>
<td colspan="2">--skip-priority-validation</td>
</tr>
<tr>
<td></td>
<td style="line-height: 130%; word-wrap: break-word;">
<p>Skip workload priority class validation. Add priority class label even if the class does not exist.</p>
</td>
</tr>
<tr>
<td colspan="2">--template string</td>
</tr>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ Create a rayjob.
KubeRay operator is required for RayJob. For instructions on installing the KubeRay operator, see https://ray-project.github.io/kuberay/deploy/installation/.

```
kjobctl create rayjob --profile APPLICATION_PROFILE_NAME [--localqueue LOCAL_QUEUE_NAME] [--skip-localqueue-validation] [--cmd COMMAND] [--replicas [WORKER_GROUP]=REPLICAS] [--min-replicas [WORKER_GROUP]=MIN_REPLICAS] [--max-replicas [WORKER_GROUP]=MAX_REPLICAS]
kjobctl create rayjob --profile APPLICATION_PROFILE_NAME [--localqueue LOCAL_QUEUE_NAME] [--skip-localqueue-validation] [--priority NAME] [--skip-priority-validation] [--cmd COMMAND] [--replicas [WORKER_GROUP]=REPLICAS] [--min-replicas [WORKER_GROUP]=MIN_REPLICAS] [--max-replicas [WORKER_GROUP]=MAX_REPLICAS]
```


Expand Down Expand Up @@ -113,6 +113,15 @@ kjobctl create rayjob --profile APPLICATION_PROFILE_NAME [--localqueue LOCAL_QUE
<p>Output format. One of: (json, yaml, name, go-template, go-template-file, template, templatefile, jsonpath, jsonpath-as-json, jsonpath-file).</p>
</td>
</tr>
<tr>
<td colspan="2">--priority string</td>
</tr>
<tr>
<td></td>
<td style="line-height: 130%; word-wrap: break-word;">
<p>Apply priority for the entire workload.</p>
</td>
</tr>
<tr>
<td colspan="2">-p, --profile string</td>
</tr>
Expand Down Expand Up @@ -158,6 +167,15 @@ kjobctl create rayjob --profile APPLICATION_PROFILE_NAME [--localqueue LOCAL_QUE
<p>Skip local queue validation. Add local queue even if the queue does not exist.</p>
</td>
</tr>
<tr>
<td colspan="2">--skip-priority-validation</td>
</tr>
<tr>
<td></td>
<td style="line-height: 130%; word-wrap: break-word;">
<p>Skip workload priority class validation. Add priority class label even if the class does not exist.</p>
</td>
</tr>
<tr>
<td colspan="2">--template string</td>
</tr>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ The file is auto-generated from the Go source code of the component using the
Create a slurm job

```
kjobctl create slurm --profile APPLICATION_PROFILE_NAME [--localqueue LOCAL_QUEUE_NAME] [--skip-localqueue-validation] [--ignore-unknown-flags] [--skip-priority-validation] [--init-image IMAGE] [--first-node-ip] [--first-node-ip-timeout DURATION] -- [--array ARRAY] [--cpus-per-task QUANTITY] [--gpus-per-task QUANTITY] [--mem QUANTITY] [--mem-per-task QUANTITY] [--mem-per-cpu QUANTITY] [--mem-per-gpu QUANTITY] [--nodes COUNT] [--ntasks COUNT] [--output FILENAME_PATTERN] [--error FILENAME_PATTERN] [--input FILENAME_PATTERN] [--job-name NAME] [--partition NAME] [--priority NAME] SCRIPT
kjobctl create slurm --profile APPLICATION_PROFILE_NAME [--localqueue LOCAL_QUEUE_NAME] [--skip-localqueue-validation] [--priority NAME] [--skip-priority-validation] [--ignore-unknown-flags] [--init-image IMAGE] [--first-node-ip] [--first-node-ip-timeout DURATION] -- [--array ARRAY] [--cpus-per-task QUANTITY] [--gpus-per-task QUANTITY] [--mem QUANTITY] [--mem-per-task QUANTITY] [--mem-per-cpu QUANTITY] [--mem-per-gpu QUANTITY] [--nodes COUNT] [--ntasks COUNT] [--output FILENAME_PATTERN] [--error FILENAME_PATTERN] [--input FILENAME_PATTERN] [--job-name NAME] [--partition NAME] SCRIPT
```


Expand Down Expand Up @@ -115,6 +115,15 @@ kjobctl create slurm --profile APPLICATION_PROFILE_NAME [--localqueue LOCAL_QUEU
<p>Output format. One of: (json, yaml, name, go-template, go-template-file, template, templatefile, jsonpath, jsonpath-as-json, jsonpath-file).</p>
</td>
</tr>
<tr>
<td colspan="2">--priority string</td>
</tr>
<tr>
<td></td>
<td style="line-height: 130%; word-wrap: break-word;">
<p>Apply priority for the entire workload.</p>
</td>
</tr>
<tr>
<td colspan="2">-p, --profile string</td>
</tr>
Expand Down
12 changes: 12 additions & 0 deletions cmd/experimental/kjobctl/pkg/builder/builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -325,6 +325,14 @@ func (b *Builder) validateGeneral(ctx context.Context) error {
}
}

// check that priority class exists
if len(b.priority) != 0 && !b.skipPriorityValidation {
_, err := b.kueueClientset.KueueV1beta1().WorkloadPriorityClasses().Get(ctx, b.priority, metav1.GetOptions{})
if err != nil {
return err
}
}

return nil
}

Expand Down Expand Up @@ -533,6 +541,10 @@ func (b *Builder) buildObjectMeta(templateObjectMeta metav1.ObjectMeta) metav1.O
objectMeta.Labels[kueueconstants.QueueLabel] = b.localQueue
}

if len(b.priority) != 0 {
objectMeta.Labels[kueueconstants.WorkloadPriorityClassLabel] = b.priority
}

return objectMeta
}

Expand Down
17 changes: 2 additions & 15 deletions cmd/experimental/kjobctl/pkg/builder/slurm_builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@ import (
"k8s.io/apimachinery/pkg/runtime"
utilrand "k8s.io/apimachinery/pkg/util/rand"
"k8s.io/utils/ptr"
kueue "sigs.k8s.io/kueue/pkg/controller/constants"

"sigs.k8s.io/kueue/cmd/experimental/kjobctl/apis/v1alpha1"
"sigs.k8s.io/kueue/cmd/experimental/kjobctl/pkg/parser"
Expand Down Expand Up @@ -116,7 +115,7 @@ type slurmBuilder struct {

var _ builder = (*slurmBuilder)(nil)

func (b *slurmBuilder) validateGeneral(ctx context.Context) error {
func (b *slurmBuilder) validateGeneral() error {
if len(b.script) == 0 {
return noScriptSpecifiedErr
}
Expand All @@ -129,14 +128,6 @@ func (b *slurmBuilder) validateGeneral(ctx context.Context) error {
return noGpusPerTaskSpecifiedErr
}

// check that priority class exists
if len(b.priority) != 0 && !b.skipPriorityValidation {
_, err := b.kueueClientset.KueueV1beta1().WorkloadPriorityClasses().Get(ctx, b.priority, metav1.GetOptions{})
if err != nil {
return err
}
}

return nil
}

Expand Down Expand Up @@ -213,7 +204,7 @@ func (b *slurmBuilder) validateMutuallyExclusiveFlags() error {
}

func (b *slurmBuilder) build(ctx context.Context) (runtime.Object, []runtime.Object, error) {
if err := b.validateGeneral(ctx); err != nil {
if err := b.validateGeneral(); err != nil {
return nil, nil, err
}

Expand All @@ -239,10 +230,6 @@ func (b *slurmBuilder) build(ctx context.Context) (runtime.Object, []runtime.Obj
job.Spec.CompletionMode = ptr.To(batchv1.IndexedCompletion)
job.Spec.Template.Spec.Subdomain = b.objectName

if len(b.priority) != 0 {
job.Labels[kueue.WorkloadPriorityClassLabel] = b.priority
}

b.buildPodSpecVolumesAndEnv(&job.Spec.Template.Spec)
job.Spec.Template.Spec.Volumes = append(job.Spec.Template.Spec.Volumes,
corev1.Volume{
Expand Down
14 changes: 7 additions & 7 deletions cmd/experimental/kjobctl/pkg/cmd/create/create.go
Original file line number Diff line number Diff line change
Expand Up @@ -330,7 +330,6 @@ var createModeSubcommands = map[string]modeSubcommand{
ModeName: v1alpha1.SlurmMode,
Setup: func(clientGetter util.ClientGetter, subcmd *cobra.Command, o *CreateOptions) {
subcmd.Use += " [--ignore-unknown-flags]" +
" [--skip-priority-validation]" +
" [--init-image IMAGE]" +
" [--first-node-ip]" +
" [--first-node-ip-timeout DURATION]" +
Expand All @@ -349,7 +348,6 @@ var createModeSubcommands = map[string]modeSubcommand{
" [--input FILENAME_PATTERN]" +
" [--job-name NAME]" +
" [--partition NAME]" +
" [--priority NAME]" +
" SCRIPT"

subcmd.Short = "Create a slurm job"
Expand All @@ -360,8 +358,6 @@ var createModeSubcommands = map[string]modeSubcommand{
"Ignore all the unsupported flags in the bash script.")
subcmd.Flags().StringVar(&o.InitImage, initImageFlagName, "registry.k8s.io/busybox:1.27.2",
"The image used for the init container.")
subcmd.Flags().BoolVar(&o.SkipPriorityValidation, skipPriorityValidationFlagName, false,
"Skip workload priority class validation. Add priority class label even if the class does not exist.")
subcmd.Flags().BoolVar(&o.FirstNodeIP, firstNodeIPFlagName, false,
"Enable the retrieval of the first node's IP address.")
subcmd.Flags().DurationVar(&o.FirstNodeIPTimeout, firstNodeIPTimeoutFlagName, time.Minute,
Expand Down Expand Up @@ -400,8 +396,6 @@ The minimum index value is 0. The maximum index value is 2147483647.`)
"What is the job name.")
o.SlurmFlagSet.StringVar(&o.Partition, partitionFlagName, "",
"Local queue name.")
o.SlurmFlagSet.StringVar(&o.Priority, priorityFlagName, "",
"Apply priority for the entire workload.")
o.SlurmFlagSet.StringVarP(&o.ChangeDir, changeDirFlagName, "D", "",
"Change directory before executing the script.")
},
Expand All @@ -428,7 +422,9 @@ func NewCreateCmd(clientGetter util.ClientGetter, streams genericiooptions.IOStr
Use: modeName +
" --profile APPLICATION_PROFILE_NAME" +
" [--localqueue LOCAL_QUEUE_NAME]" +
" [--skip-localqueue-validation]",
" [--skip-localqueue-validation]" +
" [--priority NAME]" +
" [--skip-priority-validation]",
DisableFlagsInUseLine: true,
Args: cobra.NoArgs,
RunE: func(cmd *cobra.Command, args []string) error {
Expand All @@ -451,6 +447,10 @@ func NewCreateCmd(clientGetter util.ClientGetter, streams genericiooptions.IOStr
"Kueue localqueue name which is associated with the resource.")
subcmd.Flags().BoolVar(&o.SkipLocalQueueValidation, skipLocalQueueValidationFlagName, false,
"Skip local queue validation. Add local queue even if the queue does not exist.")
subcmd.Flags().StringVar(&o.Priority, priorityFlagName, "",
"Apply priority for the entire workload.")
subcmd.Flags().BoolVar(&o.SkipPriorityValidation, skipPriorityValidationFlagName, false,
"Skip workload priority class validation. Add priority class label even if the class does not exist.")

modeSubcommand.Setup(clientGetter, subcmd, o)

Expand Down
Loading

0 comments on commit 20c83fb

Please sign in to comment.