Skip to content

Commit

Permalink
Merge pull request #345 from JimmyYang20/robot
Browse files Browse the repository at this point in the history
Add key envs for train worker in LL
  • Loading branch information
kubeedge-bot authored Aug 12, 2022
2 parents 62fbd51 + f3bcc61 commit b91cfb9
Showing 1 changed file with 46 additions and 6 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"context"
"encoding/json"
"fmt"
"strconv"
"strings"
"time"

Expand Down Expand Up @@ -542,6 +543,37 @@ func IsJobFinished(j *sednav1.LifelongLearningJob) bool {
return false
}

// hasCompletedInitialTraining reports whether jobConditions contains at least
// one condition whose stage is LLJobTrain and whose type is
// LLJobStageCondCompleted. It returns false for an empty or nil slice.
func (c *Controller) hasCompletedInitialTraining(jobConditions []sednav1.LLJobCondition) bool {
	for _, entry := range jobConditions {
		// Only train-stage conditions are relevant here.
		if entry.Stage != sednav1.LLJobTrain {
			continue
		}
		if entry.Type == sednav1.LLJobStageCondCompleted {
			return true
		}
	}
	return false
}

// getCloudKBIndex scans jobConditions from the last element to the first and
// returns the URL of the first model recorded in the output of a completed
// train-stage condition.
//
// Conditions whose data cannot be unmarshalled, or whose output holds no
// models, are skipped. An empty string is returned when no condition yields
// a model.
func (c *Controller) getCloudKBIndex(jobConditions []sednav1.LLJobCondition) string {
	for idx := len(jobConditions) - 1; idx >= 0; idx-- {
		entry := jobConditions[idx]
		if entry.Stage != sednav1.LLJobTrain || entry.Type != sednav1.LLJobStageCondCompleted {
			continue
		}

		// Decode into a fresh value each time so nothing carries over
		// from a previously inspected condition.
		var data ConditionData
		if err := data.Unmarshal([]byte(entry.Data)); err != nil {
			continue
		}

		if data.Output == nil || len(data.Output.Models) == 0 {
			continue
		}

		first := data.Output.Models[0]
		return first.GetURL()
	}
	return ""
}

func (c *Controller) createPod(job *sednav1.LifelongLearningJob, podtype sednav1.LLJobStage) (err error) {
ctx := context.Background()
var podTemplate *v1.PodTemplateSpec
Expand Down Expand Up @@ -571,8 +603,10 @@ func (c *Controller) createPod(job *sednav1.LifelongLearningJob, podtype sednav1
return err
}

jobConditions := job.Status.Conditions

// get all url for train and eval from data in condition
condDataStr := job.Status.Conditions[len(job.Status.Conditions)-1].Data
condDataStr := jobConditions[len(job.Status.Conditions)-1].Data
klog.V(2).Infof("lifelonglearning job %v/%v data condition:%s", job.Namespace, job.Name, condDataStr)
var cond ConditionData
(&cond).Unmarshal([]byte(condDataStr))
Expand All @@ -598,13 +632,19 @@ func (c *Controller) createPod(job *sednav1.LifelongLearningJob, podtype sednav1
podTemplate = &job.Spec.TrainSpec.Template
// Env parameters for train

hasCompletedInitialTraining := c.hasCompletedInitialTraining(jobConditions)

workerParam.Env = map[string]string{
"NAMESPACE": job.Namespace,
"JOB_NAME": job.Name,
"WORKER_NAME": "train-worker-" + utilrand.String(5),
"NAMESPACE": job.Namespace,
"JOB_NAME": job.Name,
"WORKER_NAME": "train-worker-" + utilrand.String(5),
"HAS_COMPLETED_INITIAL_TRAINING": strconv.FormatBool(hasCompletedInitialTraining),
"LC_SERVER": c.cfg.LC.Server,
"KB_SERVER": c.cfg.KB.Server,
}

"LC_SERVER": c.cfg.LC.Server,
"KB_SERVER": c.cfg.KB.Server,
if hasCompletedInitialTraining {
workerParam.Env["CLOUD_KB_INDEX"] = c.getCloudKBIndex(jobConditions)
}

workerParam.Mounts = append(workerParam.Mounts,
Expand Down

0 comments on commit b91cfb9

Please sign in to comment.