Rebase waitforworkers option

Signed-off-by: Lukas Hejtmanek <[email protected]>
kubeflow · Apr 4, 2023 · f975ef5 · f975ef5
1 parent c945435
commit f975ef5
Show file tree

Hide file tree

Showing 8 changed files with 80 additions and 14 deletions.
diff --git a/deploy/v2beta1/mpi-operator.yaml b/deploy/v2beta1/mpi-operator.yaml
@@ -189,6 +189,8 @@ spec:
                 type: integer
               sshAuthMountPath:
                 type: string
+              waitForWorkers:
+                type: boolean
             type: object
           status:
             properties:

diff --git a/manifests/base/crd.yaml b/manifests/base/crd.yaml
@@ -125,6 +125,8 @@ spec:
                     description: "Specifies the number of retries before marking the launcher Job as failed. Defaults to 6."
               sshAuthMountPath:
                 type: string
+              waitForWorkers:
+                type: boolean
               mpiImplementation:
                 type: string
                 enum: ["OpenMPI", "Intel"]

diff --git a/sdk/python/v2beta1/mpijob/models/v2beta1_mpi_job_spec.py b/sdk/python/v2beta1/mpijob/models/v2beta1_mpi_job_spec.py
diff --git a/v2/crd/kubeflow.org_mpijobs.yaml b/v2/crd/kubeflow.org_mpijobs.yaml
@@ -6820,6 +6820,10 @@ spec:
                 description: SSHAuthMountPath is the directory where SSH keys are
                   mounted.
                 type: string
+              waitForWorkers:
+                default: false
+                description: Spawn launcher only after all workers are in Ready state.
+                type: boolean
             required:
             - mpiReplicaSpecs
             type: object

diff --git a/v2/pkg/apis/kubeflow/v2beta1/openapi_generated.go b/v2/pkg/apis/kubeflow/v2beta1/openapi_generated.go
diff --git a/v2/pkg/apis/kubeflow/v2beta1/swagger.json b/v2/pkg/apis/kubeflow/v2beta1/swagger.json
@@ -245,6 +245,10 @@
         "sshAuthMountPath": {
           "description": "SSHAuthMountPath is the directory where SSH keys are mounted. Defaults to \"/root/.ssh\".",
           "type": "string"
+        },
+        "waitForWorkers": {
+          "description": "Spawn launcher pod only after all worker pods are in Ready state. Defaults to false.",
+          "type": "boolean"
         }
       }
     }

diff --git a/v2/pkg/apis/kubeflow/v2beta1/types.go b/v2/pkg/apis/kubeflow/v2beta1/types.go
@@ -55,6 +55,9 @@ type MPIJobSpec struct {
 	// +kubebuilder:default:="/root/.ssh"
 	SSHAuthMountPath string `json:"sshAuthMountPath,omitempty"`
 
+	// WaitForWorkers, if true, makes the controller create the launcher only
+	// after all workers are in the Ready state. Defaults to false.
+	WaitForWorkers bool `json:"waitForWorkers,omitempty"`
+
 	// MPIImplementation is the MPI implementation.
 	// Options are "OpenMPI" (default) and "Intel".
 	// +kubebuilder:validation:Enum:=OpenMPI;Intel

diff --git a/v2/pkg/controller/mpi_job_controller.go b/v2/pkg/controller/mpi_job_controller.go
@@ -570,10 +570,14 @@ func (c *MPIJobController) syncHandler(key string) error {
 			}
 		}
 		if launcher == nil {
-			launcher, err = c.kubeClient.BatchV1().Jobs(namespace).Create(context.TODO(), c.newLauncherJob(mpiJob), metav1.CreateOptions{})
-			if err != nil {
-				c.recorder.Eventf(mpiJob, corev1.EventTypeWarning, mpiJobFailedReason, "launcher pod created failed: %v", err)
-				return fmt.Errorf("creating launcher Pod: %w", err)
+			if !mpiJob.Spec.WaitForWorkers || c.countReadyWorkerPods(worker) == len(worker) {
+				launcher, err = c.kubeClient.BatchV1().Jobs(namespace).Create(context.TODO(), c.newLauncherJob(mpiJob), metav1.CreateOptions{})
+				if err != nil {
+					c.recorder.Eventf(mpiJob, corev1.EventTypeWarning, mpiJobFailedReason, "launcher pod created failed: %v", err)
+					return fmt.Errorf("creating launcher Pod: %w", err)
+				}
+			} else {
+				klog.V(4).Infof("Waiting for workers %s/%s to start.", mpiJob.Namespace, mpiJob.Name)
 			}
 		}
 	}
@@ -687,6 +691,18 @@ func (c *MPIJobController) getRunningWorkerPods(mpiJob *kubeflow.MPIJob) ([]*cor
 	return podList, nil
 }
 
+// countReadyWorkerPods returns the number of pods in workers whose status
+// carries a PodReady condition with status True. It returns 0 for an empty or
+// nil slice.
+func (c *MPIJobController) countReadyWorkerPods(workers []*corev1.Pod) int {
+	ready := 0
+	for _, pod := range workers {
+		// The loop variable is named cond so it does not shadow the receiver c.
+		for _, cond := range pod.Status.Conditions {
+			if cond.Type == corev1.PodReady && cond.Status == corev1.ConditionTrue {
+				ready++
+				// Count each pod at most once, even if the condition list
+				// contains more than one matching entry.
+				break
+			}
+		}
+	}
+	return ready
+}
+
 // getOrCreateConfigMap gets the ConfigMap controlled by this MPIJob, or creates
 // one if it doesn't exist.
 func (c *MPIJobController) getOrCreateConfigMap(mpiJob *kubeflow.MPIJob) (*corev1.ConfigMap, error) {
@@ -901,14 +917,15 @@ func (c *MPIJobController) deleteWorkerPods(mpiJob *kubeflow.MPIJob) error {
 
 func (c *MPIJobController) updateMPIJobStatus(mpiJob *kubeflow.MPIJob, launcher *batchv1.Job, worker []*corev1.Pod) error {
 	oldStatus := mpiJob.Status.DeepCopy()
-	launcherPods, err := c.jobPods(launcher)
-	if err != nil {
-		return fmt.Errorf("checking launcher pods running: %w", err)
-	}
-	// Job.status.Active accounts for Pending and Running pods. Count running pods
-	// from the lister instead.
-	launcherPodsCnt := countRunningPods(launcherPods)
+	launcherPodsCnt := 0
 	if launcher != nil {
+		launcherPods, err := c.jobPods(launcher)
+		if err != nil {
+			return fmt.Errorf("checking launcher pods running: %w", err)
+		}
+		// Job.status.Active accounts for Pending and Running pods. Count running pods
+		// from the lister instead.
+		launcherPodsCnt := countRunningPods(launcherPods)
 		initializeMPIJobStatuses(mpiJob, kubeflow.MPIReplicaTypeLauncher)
 		launcherStatus := mpiJob.Status.ReplicaStatuses[common.ReplicaType(kubeflow.MPIReplicaTypeLauncher)]
 		launcherStatus.Failed = launcher.Status.Failed