kubeflow · google-oss-prow · Jun 26, 2023 · Apr 4, 2023 · Apr 4, 2023 · Apr 4, 2023
diff --git a/deploy/v2beta1/mpi-operator.yaml b/deploy/v2beta1/mpi-operator.yaml
@@ -55,6 +55,11 @@ spec:
             type: object
           spec:
             properties:
+              launcherCreationPolicy:
+                description: launcherCreationPolicy if WaitForWorkersReady, the launcher
+                  is created only after all workers are in Ready state. Defaults to
+                  AtStartup.
+                type: string
               mpiImplementation:
                 default: OpenMPI
                 description: MPIImplementation is the MPI implementation. Options

diff --git a/manifests/base/kubeflow.org_mpijobs.yaml b/manifests/base/kubeflow.org_mpijobs.yaml
@@ -32,6 +32,11 @@ spec:
             type: object
           spec:
             properties:
+              launcherCreationPolicy:
+                description: launcherCreationPolicy if WaitForWorkersReady, the launcher
+                  is created only after all workers are in Ready state. Defaults to
+                  AtStartup.
+                type: string
               mpiImplementation:
                 default: OpenMPI
                 description: MPIImplementation is the MPI implementation. Options

diff --git a/pkg/apis/kubeflow/v2beta1/default.go b/pkg/apis/kubeflow/v2beta1/default.go
@@ -68,6 +68,9 @@ func SetDefaults_MPIJob(mpiJob *MPIJob) {
 	if mpiJob.Spec.MPIImplementation == "" {
 		mpiJob.Spec.MPIImplementation = MPIImplementationOpenMPI
 	}
+	if mpiJob.Spec.LauncherCreationPolicy == "" {
+		mpiJob.Spec.LauncherCreationPolicy = LauncherCreationPolicyAtStartup
+	}
 
 	// set default to Launcher
 	setDefaultsTypeLauncher(mpiJob.Spec.MPIReplicaSpecs[MPIReplicaTypeLauncher])

diff --git a/pkg/apis/kubeflow/v2beta1/default_test.go b/pkg/apis/kubeflow/v2beta1/default_test.go
@@ -33,8 +33,9 @@ func TestSetDefaults_MPIJob(t *testing.T) {
 					RunPolicy: RunPolicy{
 						CleanPodPolicy: NewCleanPodPolicy(CleanPodPolicyNone),
 					},
-					SSHAuthMountPath:  "/root/.ssh",
-					MPIImplementation: MPIImplementationOpenMPI,
+					SSHAuthMountPath:       "/root/.ssh",
+					MPIImplementation:      MPIImplementationOpenMPI,
+					LauncherCreationPolicy: "AtStartup",
 				},
 			},
 		},
@@ -48,8 +49,9 @@ func TestSetDefaults_MPIJob(t *testing.T) {
 						ActiveDeadlineSeconds:   newInt64(3),
 						BackoffLimit:            newInt32(4),
 					},
-					SSHAuthMountPath:  "/home/mpiuser/.ssh",
-					MPIImplementation: MPIImplementationIntel,
+					SSHAuthMountPath:       "/home/mpiuser/.ssh",
+					MPIImplementation:      MPIImplementationIntel,
+					LauncherCreationPolicy: "AtStartup",
 				},
 			},
 			want: MPIJob{
@@ -61,8 +63,9 @@ func TestSetDefaults_MPIJob(t *testing.T) {
 						ActiveDeadlineSeconds:   newInt64(3),
 						BackoffLimit:            newInt32(4),
 					},
-					SSHAuthMountPath:  "/home/mpiuser/.ssh",
-					MPIImplementation: MPIImplementationIntel,
+					SSHAuthMountPath:       "/home/mpiuser/.ssh",
+					MPIImplementation:      MPIImplementationIntel,
+					LauncherCreationPolicy: "AtStartup",
 				},
 			},
 		},
@@ -76,8 +79,9 @@ func TestSetDefaults_MPIJob(t *testing.T) {
 						ActiveDeadlineSeconds:   newInt64(3),
 						BackoffLimit:            newInt32(4),
 					},
-					SSHAuthMountPath:  "/home/mpiuser/.ssh",
-					MPIImplementation: MPIImplementationMPICH,
+					SSHAuthMountPath:       "/home/mpiuser/.ssh",
+					MPIImplementation:      MPIImplementationMPICH,
+					LauncherCreationPolicy: "AtStartup",
 				},
 			},
 			want: MPIJob{
@@ -89,8 +93,9 @@ func TestSetDefaults_MPIJob(t *testing.T) {
 						ActiveDeadlineSeconds:   newInt64(3),
 						BackoffLimit:            newInt32(4),
 					},
-					SSHAuthMountPath:  "/home/mpiuser/.ssh",
-					MPIImplementation: MPIImplementationMPICH,
+					SSHAuthMountPath:       "/home/mpiuser/.ssh",
+					MPIImplementation:      MPIImplementationMPICH,
+					LauncherCreationPolicy: "AtStartup",
 				},
 			},
 		},
@@ -108,8 +113,9 @@ func TestSetDefaults_MPIJob(t *testing.T) {
 					RunPolicy: RunPolicy{
 						CleanPodPolicy: NewCleanPodPolicy(CleanPodPolicyNone),
 					},
-					SSHAuthMountPath:  "/root/.ssh",
-					MPIImplementation: MPIImplementationOpenMPI,
+					SSHAuthMountPath:       "/root/.ssh",
+					MPIImplementation:      MPIImplementationOpenMPI,
+					LauncherCreationPolicy: "AtStartup",
 					MPIReplicaSpecs: map[MPIReplicaType]*common.ReplicaSpec{
 						MPIReplicaTypeLauncher: {
 							Replicas:      newInt32(1),
@@ -133,8 +139,9 @@ func TestSetDefaults_MPIJob(t *testing.T) {
 					RunPolicy: RunPolicy{
 						CleanPodPolicy: NewCleanPodPolicy(CleanPodPolicyNone),
 					},
-					SSHAuthMountPath:  "/root/.ssh",
-					MPIImplementation: MPIImplementationOpenMPI,
+					SSHAuthMountPath:       "/root/.ssh",
+					MPIImplementation:      MPIImplementationOpenMPI,
+					LauncherCreationPolicy: "AtStartup",
 					MPIReplicaSpecs: map[MPIReplicaType]*common.ReplicaSpec{
 						MPIReplicaTypeWorker: {
 							Replicas:      newInt32(0),

diff --git a/pkg/apis/kubeflow/v2beta1/openapi_generated.go b/pkg/apis/kubeflow/v2beta1/openapi_generated.go
diff --git a/pkg/apis/kubeflow/v2beta1/swagger.json b/pkg/apis/kubeflow/v2beta1/swagger.json
@@ -321,6 +321,10 @@
         "mpiReplicaSpecs"
       ],
       "properties": {
+        "launcherCreationPolicy": {
+          "description": "launcherCreationPolicy if WaitForWorkersReady, the launcher is created only after all workers are in Ready state. Defaults to AtStartup.",
+          "type": "string"
+        },
         "mpiImplementation": {
           "description": "MPIImplementation is the MPI implementation. Options are \"OpenMPI\" (default), \"Intel\" and \"MPICH\".",
           "type": "string"

diff --git a/pkg/apis/kubeflow/v2beta1/types.go b/pkg/apis/kubeflow/v2beta1/types.go
@@ -134,6 +134,13 @@ type RunPolicy struct {
 	Suspend *bool `json:"suspend,omitempty"`
 }
 
+type LauncherCreationPolicy string // LauncherCreationPolicy selects when the launcher is created relative to the workers.
+
+const (
+	LauncherCreationPolicyAtStartup           LauncherCreationPolicy = "AtStartup"           // create the launcher without waiting for workers (the default)
+	LauncherCreationPolicyWaitForWorkersReady LauncherCreationPolicy = "WaitForWorkersReady" // create the launcher only after all workers are Ready
+)
+
 type MPIJobSpec struct {
 
 	// Specifies the number of slots per worker used in hostfile.
@@ -154,6 +161,9 @@ type MPIJobSpec struct {
 	// +kubebuilder:default:="/root/.ssh"
 	SSHAuthMountPath string `json:"sshAuthMountPath,omitempty"`
 
+	// launcherCreationPolicy if WaitForWorkersReady, the launcher is created only after all workers are in Ready state. Defaults to AtStartup.
+	LauncherCreationPolicy LauncherCreationPolicy `json:"launcherCreationPolicy,omitempty"`
+
 	// MPIImplementation is the MPI implementation.
 	// Options are "OpenMPI" (default), "Intel" and "MPICH".
 	// +kubebuilder:validation:Enum:=OpenMPI;Intel;MPICH

diff --git a/pkg/controller/mpi_job_controller.go b/pkg/controller/mpi_job_controller.go
@@ -624,10 +624,14 @@ func (c *MPIJobController) syncHandler(key string) error {
 			}
 		}
 		if launcher == nil {
-			launcher, err = c.kubeClient.BatchV1().Jobs(namespace).Create(context.TODO(), c.newLauncherJob(mpiJob), metav1.CreateOptions{})
-			if err != nil {
-				c.recorder.Eventf(mpiJob, corev1.EventTypeWarning, mpiJobFailedReason, "launcher pod created failed: %v", err)
-				return fmt.Errorf("creating launcher Pod: %w", err)
+			if mpiJob.Spec.LauncherCreationPolicy == kubeflow.LauncherCreationPolicyAtStartup || c.countReadyWorkerPods(worker) == len(worker) {
+				launcher, err = c.kubeClient.BatchV1().Jobs(namespace).Create(context.TODO(), c.newLauncherJob(mpiJob), metav1.CreateOptions{})
+				if err != nil {
+					c.recorder.Eventf(mpiJob, corev1.EventTypeWarning, mpiJobFailedReason, "launcher pod created failed: %v", err)
+					return fmt.Errorf("creating launcher Pod: %w", err)
+				}
+			} else {
+				klog.V(4).Infof("Waiting for workers %s/%s to start.", mpiJob.Namespace, mpiJob.Name)
 			}
 		}
 	}
@@ -776,6 +780,19 @@ func (c *MPIJobController) getRunningWorkerPods(mpiJob *kubeflow.MPIJob) ([]*cor
 	return podList, nil
 }
 
+func (c *MPIJobController) countReadyWorkerPods(workers []*corev1.Pod) int { // returns how many of workers have a PodReady condition with status True
+	ready := 0
+	for _, pod := range workers {
+		for _, cond := range pod.Status.Conditions { // named cond so it does not shadow the receiver c
+			if cond.Type == corev1.PodReady && cond.Status == corev1.ConditionTrue {
+				ready++ // count each pod exactly once so the result can equal len(workers)
+				break
+			}
+		}
+	}
+	return ready
+}
+
 // getOrCreateConfigMap gets the ConfigMap controlled by this MPIJob, or creates
 // one if it doesn't exist.
 func (c *MPIJobController) getOrCreateConfigMap(mpiJob *kubeflow.MPIJob) (*corev1.ConfigMap, error) {
@@ -1011,14 +1028,15 @@ func (c *MPIJobController) updateMPIJobStatus(mpiJob *kubeflow.MPIJob, launcher
 			mpiJob.Status.StartTime = &now
 		}
 	}
-	launcherPods, err := c.jobPods(launcher)
-	if err != nil {
-		return fmt.Errorf("checking launcher pods running: %w", err)
-	}
-	// Job.status.Active accounts for Pending and Running pods. Count running pods
-	// from the lister instead.
-	launcherPodsCnt := countRunningPods(launcherPods)
+	launcherPodsCnt := 0
 	if launcher != nil {
+		launcherPods, err := c.jobPods(launcher)
+		if err != nil {
+			return fmt.Errorf("checking launcher pods running: %w", err)
+		}
+		// Job.status.Active accounts for Pending and Running pods. Count running pods
+		// from the lister instead.
+		launcherPodsCnt = countRunningPods(launcherPods)
 		initializeMPIJobStatuses(mpiJob, kubeflow.MPIReplicaTypeLauncher)
 		launcherStatus := mpiJob.Status.ReplicaStatuses[kubeflow.MPIReplicaTypeLauncher]
 		launcherStatus.Failed = launcher.Status.Failed

diff --git a/sdk/python/v2beta1/docs/V2beta1MPIJobSpec.md b/sdk/python/v2beta1/docs/V2beta1MPIJobSpec.md
diff --git a/sdk/python/v2beta1/mpijob/models/v2beta1_mpi_job_spec.py b/sdk/python/v2beta1/mpijob/models/v2beta1_mpi_job_spec.py
diff --git a/test/e2e/mpi_job_test.go b/test/e2e/mpi_job_test.go
@@ -216,7 +216,7 @@ var _ = ginkgo.Describe("MPIJob", func() {
 		})
 
 		ginkgo.When("running as non-root", func() {
-			ginkgo.BeforeEach(func () {
+			ginkgo.BeforeEach(func() {
 				mpiJob.Spec.SSHAuthMountPath = "/home/mpiuser/.ssh"
 
 				mpiJob.Spec.MPIReplicaSpecs[kubeflow.MPIReplicaTypeLauncher].Template.Spec.Containers[0].SecurityContext = &corev1.SecurityContext{
@@ -283,7 +283,7 @@ var _ = ginkgo.Describe("MPIJob", func() {
 		})
 
 		ginkgo.When("running as non-root", func() {
-			ginkgo.BeforeEach(func () {
+			ginkgo.BeforeEach(func() {
 				mpiJob.Spec.SSHAuthMountPath = "/home/mpiuser/.ssh"
 
 				mpiJob.Spec.MPIReplicaSpecs[kubeflow.MPIReplicaTypeLauncher].Template.Spec.Containers[0].SecurityContext = &corev1.SecurityContext{