diff --git a/go.mod b/go.mod index 65586161be..d37dd03f2f 100644 --- a/go.mod +++ b/go.mod @@ -28,7 +28,7 @@ require ( github.com/openshift/client-go v0.0.0-20210112165513-ebc401615f47 github.com/pborman/uuid v1.2.0 github.com/pierrec/lz4 v2.5.2+incompatible // indirect - github.com/portworx/kdmp v0.4.1-0.20211117114335-623768a845fc + github.com/portworx/kdmp v0.4.1-0.20211119130352-c09d94d769d7 github.com/portworx/sched-ops v1.20.4-rc1.0.20211116074603-2b6905763b23 github.com/portworx/torpedo v0.20.4-rc1.0.20210325154352-eb81b0cdd145 github.com/prometheus/client_golang v1.9.0 diff --git a/go.sum b/go.sum index bd10cba0f1..79f693e223 100644 --- a/go.sum +++ b/go.sum @@ -1134,6 +1134,8 @@ github.com/portworx/kdmp v0.4.1-0.20211116131424-efce3ec4cad4 h1:Iw1lBKH/GsJOURc github.com/portworx/kdmp v0.4.1-0.20211116131424-efce3ec4cad4/go.mod h1:QrB4B1mH0aUtQa/vvfi+xf8fTUf3oJf0dCYzM61psAg= github.com/portworx/kdmp v0.4.1-0.20211117114335-623768a845fc h1:De0pxRI33i1Rb8/bT9X5NCZQ9hy5ykqcd+YFPzgNWHE= github.com/portworx/kdmp v0.4.1-0.20211117114335-623768a845fc/go.mod h1:iPEUswzaOQ+Ox7CFqQDh2g4G+zf4FREEzmqp23MUHHQ= +github.com/portworx/kdmp v0.4.1-0.20211119130352-c09d94d769d7 h1:6iSJP1RDofxYx4eAcj15T6ByxKqflLNS2jBqlJp0now= +github.com/portworx/kdmp v0.4.1-0.20211119130352-c09d94d769d7/go.mod h1:iPEUswzaOQ+Ox7CFqQDh2g4G+zf4FREEzmqp23MUHHQ= github.com/portworx/kvdb v0.0.0-20190105022415-cccaa09abfc9/go.mod h1:Q8YyrNDvPp3DVF96BDcQuaC7fAYUCuUX+l58S7OnD2M= github.com/portworx/kvdb v0.0.0-20191223203141-f42097b1fcd8/go.mod h1:Q8YyrNDvPp3DVF96BDcQuaC7fAYUCuUX+l58S7OnD2M= github.com/portworx/kvdb v0.0.0-20200311180812-b2c72382d652 h1:NElBL34RIHlZKGtDVXT/srxuVZ1+tqmnCdm1K+MC+fU= diff --git a/vendor/github.com/portworx/kdmp/pkg/controllers/dataexport/reconcile.go b/vendor/github.com/portworx/kdmp/pkg/controllers/dataexport/reconcile.go index 45878bb0b3..989f3b473e 100644 --- a/vendor/github.com/portworx/kdmp/pkg/controllers/dataexport/reconcile.go +++ b/vendor/github.com/portworx/kdmp/pkg/controllers/dataexport/reconcile.go @@ -98,7 +98,6 @@ func (c *Controller) sync(ctx context.Context, in *kdmpapi.DataExport) (bool, er return false, nil } dataExport := in.DeepCopy() - // set the initial stage if dataExport.Status.Stage == "" { // TODO: set defaults @@ -189,6 +188,16 @@ func (c *Controller) sync(ctx context.Context, in *kdmpapi.DataExport) (bool, er return true, c.updateStatus(dataExport, data) } + if dataExport.Status.Status == kdmpapi.DataExportStatusFailed { + // set to the next stage + data := updateDataExportDetail{ + stage: kdmpapi.DataExportStageCleanup, + status: dataExport.Status.Status, + reason: "", + } + return false, c.updateStatus(dataExport, data) + } + // use snapshot pvc in the dst namespace if it's available srcPVCName := dataExport.Spec.Source.Name if dataExport.Status.SnapshotPVCName != "" { @@ -343,7 +352,17 @@ func (c *Controller) sync(ctx context.Context, in *kdmpapi.DataExport) (bool, er } return false, c.updateStatus(dataExport, data) } - + // When job reaches the last retry limit, pod is removed and reconciler + // can be coming later during which is pod might be deleted. GetPods() doesn't give + // error when pods are not present hence we are not sure if pods are still coming up + // or deleted. + if progress.RestartCount < (utils.JobPodBackOffLimit - 1) { + logrus.Tracef("pod restart cnt: %v", progress.RestartCount) + data := updateDataExportDetail{ + status: kdmpapi.DataExportStatusInProgress, + } + return true, c.updateStatus(dataExport, data) + } switch progress.State { case drivers.JobStateFailed: errMsg := fmt.Sprintf("%s transfer job failed: %s", dataExport.Status.TransferID, progress.Reason) @@ -1474,42 +1493,54 @@ func waitForPVCBound(in kdmpapi.DataExportObjectReference, checkMounts bool) (*c } func checkPVCIgnoringJobMounts(in kdmpapi.DataExportObjectReference, expectedMountJob string) (*corev1.PersistentVolumeClaim, error) { - if err := checkNameNamespace(in); err != nil { - return nil, err - } - pvc, err := core.Instance().GetPersistentVolumeClaim(in.Name, in.Namespace) - if err != nil { - return nil, err - } - var sc *storagev1.StorageClass - storageClassName := k8shelper.GetPersistentVolumeClaimClass(pvc) - if storageClassName != "" { - sc, err = storage.Instance().GetStorageClass(storageClassName) - if err != nil { - return nil, err + var pvc *corev1.PersistentVolumeClaim + var checkErr error + checkTask := func() (interface{}, bool, error) { + if checkErr := checkNameNamespace(in); checkErr != nil { + return "", true, checkErr + } + pvc, checkErr = core.Instance().GetPersistentVolumeClaim(in.Name, in.Namespace) + if checkErr != nil { + return "", true, checkErr + } + var sc *storagev1.StorageClass + storageClassName := k8shelper.GetPersistentVolumeClaimClass(pvc) + if storageClassName != "" { + sc, checkErr = storage.Instance().GetStorageClass(storageClassName) + if checkErr != nil { + return "", true, checkErr + } + logrus.Debugf("checkPVCIgnoringJobMounts: pvc name %v - storage class VolumeBindingMode %v", pvc.Name, *sc.VolumeBindingMode) } - logrus.Debugf("checkPVCIgnoringJobMounts: pvc name %v - storage class VolumeBindingMode %v", pvc.Name, *sc.VolumeBindingMode) - } - if *sc.VolumeBindingMode != storagev1.VolumeBindingWaitForFirstConsumer { - // wait for pvc to get bound - pvc, err = waitForPVCBound(in, true) - if err != nil { - return nil, err + if *sc.VolumeBindingMode != storagev1.VolumeBindingWaitForFirstConsumer { + // wait for pvc to get bound + pvc, checkErr = waitForPVCBound(in, true) + if checkErr != nil { + return "", true, checkErr + } } - } - pods, err := core.Instance().GetPodsUsingPVC(pvc.Name, pvc.Namespace) - if err != nil { - return nil, fmt.Errorf("get mounted pods: %v", err) - } + pods, checkErr := core.Instance().GetPodsUsingPVC(pvc.Name, pvc.Namespace) + if checkErr != nil { + return "", true, fmt.Errorf("get mounted pods: %v", checkErr) + } - if len(pods) > 0 { - for _, pod := range pods { - if podBelongsToJob(pod, expectedMountJob, pvc.Namespace) { - return pvc, nil + if len(pods) > 0 { + for _, pod := range pods { + if podBelongsToJob(pod, expectedMountJob, pvc.Namespace) { + return "", false, nil + } } + checkErr = fmt.Errorf("mounted to %v pods", toPodNames(pods)) + return "", false, checkErr } - return nil, fmt.Errorf("mounted to %v pods", toPodNames(pods)) + return "", false, nil + } + if _, err := task.DoRetryWithTimeout(checkTask, defaultTimeout, progressCheckInterval); err != nil { + errMsg := fmt.Sprintf("max retries done, failed to check the PVC status in dataexport %v/%v: %v", in.Namespace, in.Name, checkErr) + logrus.Errorf("%v", errMsg) + // Exhausted all retries, fail the CR + return nil, fmt.Errorf("%v", errMsg) } return pvc, nil } diff --git a/vendor/github.com/portworx/kdmp/pkg/drivers/drivers.go b/vendor/github.com/portworx/kdmp/pkg/drivers/drivers.go index 78c9be5973..e405f58603 100644 --- a/vendor/github.com/portworx/kdmp/pkg/drivers/drivers.go +++ b/vendor/github.com/portworx/kdmp/pkg/drivers/drivers.go @@ -124,6 +124,8 @@ type JobStatus struct { ProgressPercents float64 State JobState Reason string + // RestartCount holds container restart count of job pod + RestartCount int32 } // IsTransferCompleted allows to check transfer status. diff --git a/vendor/github.com/portworx/kdmp/pkg/drivers/kopiabackup/kopiabackup.go b/vendor/github.com/portworx/kdmp/pkg/drivers/kopiabackup/kopiabackup.go index 3dd43f8888..90ef830e6a 100644 --- a/vendor/github.com/portworx/kdmp/pkg/drivers/kopiabackup/kopiabackup.go +++ b/vendor/github.com/portworx/kdmp/pkg/drivers/kopiabackup/kopiabackup.go @@ -189,7 +189,7 @@ func (d Driver) JobStatus(id string) (*drivers.JobStatus, error) { fn := "JobStatus" namespace, name, err := utils.ParseJobID(id) if err != nil { - return utils.ToJobStatus(0, err.Error()), nil + return utils.ToJobStatus(0, err.Error(), 0), nil } job, err := batch.Instance().GetJob(name, namespace) @@ -198,15 +198,21 @@ func (d Driver) JobStatus(id string) (*drivers.JobStatus, error) { logrus.Errorf("%s: %v", fn, errMsg) return nil, fmt.Errorf(errMsg) } + restartCount, err := utils.FetchJobContainerRestartCount(job) + if err != nil { + errMsg := fmt.Sprintf("failed to get restart count for job %s/%s job: %v", namespace, name, err) + logrus.Errorf("%s: %v", fn, errMsg) + return nil, fmt.Errorf(errMsg) + } jobErr, nodeErr := utils.IsJobOrNodeFailed(job) var errMsg string if jobErr { errMsg = fmt.Sprintf("check %s/%s job for details: %s", namespace, name, drivers.ErrJobFailed) - return utils.ToJobStatus(0, errMsg), nil + return utils.ToJobStatus(0, errMsg, restartCount), nil } if nodeErr { errMsg = fmt.Sprintf("Node [%v] on which job [%v/%v] schedules is NotReady", job.Spec.Template.Spec.NodeName, namespace, name) - return utils.ToJobStatus(0, errMsg), nil + return utils.ToJobStatus(0, errMsg, restartCount), nil } vb, err := kdmpops.Instance().GetVolumeBackup(context.Background(), name, namespace) @@ -214,7 +220,7 @@ func (d Driver) JobStatus(id string) (*drivers.JobStatus, error) { if apierrors.IsNotFound(err) { if utils.IsJobPending(job) { logrus.Warnf("backup job %s is in pending state", job.Name) - return utils.ToJobStatus(0, ""), nil + return utils.ToJobStatus(0, "", restartCount), nil } } errMsg := fmt.Sprintf("failed to fetch volumebackup %s/%s status: %v", namespace, name, err) @@ -222,7 +228,7 @@ func (d Driver) JobStatus(id string) (*drivers.JobStatus, error) { return nil, fmt.Errorf(errMsg) } - return utils.ToJobStatus(vb.Status.ProgressPercentage, vb.Status.LastKnownError), nil + return utils.ToJobStatus(vb.Status.ProgressPercentage, vb.Status.LastKnownError, restartCount), nil } func (d Driver) validate(o drivers.JobOpts) error { @@ -279,6 +285,7 @@ func jobFor( Labels: labels, }, Spec: batchv1.JobSpec{ + BackoffLimit: &utils.JobPodBackOffLimit, Template: corev1.PodTemplateSpec{ ObjectMeta: metav1.ObjectMeta{ Labels: labels, @@ -402,9 +409,14 @@ func buildJob(jobName string, jobOptions drivers.JobOpts) (*batchv1.Job, error) logrus.Errorf("%s: %v", fn, errMsg) return nil, fmt.Errorf(errMsg) } - // run a "live" backup if a pvc is mounted (mount a kubelet directory with pod volumes) if len(pods) > 0 { + logrus.Debugf("buildJob: pod %v phase %v pvc: %v/%v", pods[0].Name, pods[0].Status.Phase, jobOptions.Namespace, jobOptions.SourcePVCName) + if pods[0].Status.Phase == corev1.PodPending { + errMsg := fmt.Sprintf("pods %v is using pvc %v/%v but it is in pending state, backup is not possible", pods[0].Name, jobOptions.Namespace, jobOptions.SourcePVCName) + logrus.Errorf("%s: %v", fn, errMsg) + return nil, fmt.Errorf(errMsg) + } return jobForLiveBackup( jobOptions, jobName, diff --git a/vendor/github.com/portworx/kdmp/pkg/drivers/kopiabackup/kopiabackuplive.go b/vendor/github.com/portworx/kdmp/pkg/drivers/kopiabackup/kopiabackuplive.go index c8d76a98ee..64a6b98496 100644 --- a/vendor/github.com/portworx/kdmp/pkg/drivers/kopiabackup/kopiabackuplive.go +++ b/vendor/github.com/portworx/kdmp/pkg/drivers/kopiabackup/kopiabackuplive.go @@ -71,6 +71,7 @@ func jobForLiveBackup( Labels: labels, }, Spec: batchv1.JobSpec{ + BackoffLimit: &utils.JobPodBackOffLimit, Template: corev1.PodTemplateSpec{ ObjectMeta: metav1.ObjectMeta{ Labels: labels, diff --git a/vendor/github.com/portworx/kdmp/pkg/drivers/kopiadelete/kopiadelete.go b/vendor/github.com/portworx/kdmp/pkg/drivers/kopiadelete/kopiadelete.go index da45ba1990..75cb82e133 100644 --- a/vendor/github.com/portworx/kdmp/pkg/drivers/kopiadelete/kopiadelete.go +++ b/vendor/github.com/portworx/kdmp/pkg/drivers/kopiadelete/kopiadelete.go @@ -121,7 +121,7 @@ func (d Driver) JobStatus(id string) (*drivers.JobStatus, error) { fn := "JobStatus" namespace, name, err := utils.ParseJobID(id) if err != nil { - return utils.ToJobStatus(0, err.Error()), nil + return utils.ToJobStatus(0, err.Error(), 0), nil } job, err := batch.Instance().GetJob(name, namespace) @@ -130,14 +130,21 @@ func (d Driver) JobStatus(id string) (*drivers.JobStatus, error) { logrus.Errorf("%s: %v", fn, errMsg) return nil, fmt.Errorf(errMsg) } + restartCount, err := utils.FetchJobContainerRestartCount(job) + if err != nil { + errMsg := fmt.Sprintf("failed to get restart count for job %s/%s job: %v", namespace, name, err) + logrus.Errorf("%s: %v", fn, errMsg) + return nil, fmt.Errorf(errMsg) + } + if utils.IsJobFailed(job) { errMsg := fmt.Sprintf("check %s/%s job for details: %s", namespace, name, drivers.ErrJobFailed) - return utils.ToJobStatus(0, errMsg), nil + return utils.ToJobStatus(0, errMsg, restartCount), nil } if utils.IsJobCompleted(job) { - return utils.ToJobStatus(drivers.TransferProgressCompleted, ""), nil + return utils.ToJobStatus(drivers.TransferProgressCompleted, "", restartCount), nil } - return utils.ToJobStatus(0, ""), nil + return utils.ToJobStatus(0, "", restartCount), nil } func (d Driver) validate(o drivers.JobOpts) error { @@ -177,6 +184,7 @@ func jobFor( Labels: labels, }, Spec: batchv1.JobSpec{ + BackoffLimit: &utils.JobPodBackOffLimit, Template: corev1.PodTemplateSpec{ ObjectMeta: metav1.ObjectMeta{ Labels: labels, diff --git a/vendor/github.com/portworx/kdmp/pkg/drivers/kopiamaintenance/kopiamaintenance.go b/vendor/github.com/portworx/kdmp/pkg/drivers/kopiamaintenance/kopiamaintenance.go index 98bac54969..7bd2a26c44 100644 --- a/vendor/github.com/portworx/kdmp/pkg/drivers/kopiamaintenance/kopiamaintenance.go +++ b/vendor/github.com/portworx/kdmp/pkg/drivers/kopiamaintenance/kopiamaintenance.go @@ -94,7 +94,7 @@ func (d Driver) JobStatus(id string) (*drivers.JobStatus, error) { fn := "JobStatus" namespace, name, err := utils.ParseJobID(id) if err != nil { - return utils.ToJobStatus(0, err.Error()), nil + return utils.ToJobStatus(0, err.Error(), 0), nil } job, err := batch.Instance().GetJob(name, namespace) @@ -103,14 +103,22 @@ func (d Driver) JobStatus(id string) (*drivers.JobStatus, error) { logrus.Errorf("%s: %v", fn, errMsg) return nil, fmt.Errorf(errMsg) } + + restartCount, err := utils.FetchJobContainerRestartCount(job) + if err != nil { + errMsg := fmt.Sprintf("failed to get restart count for job %s/%s job: %v", namespace, name, err) + logrus.Errorf("%s: %v", fn, errMsg) + return nil, fmt.Errorf(errMsg) + } + if utils.IsJobFailed(job) { errMsg := fmt.Sprintf("check maintenance [%s/%s] job for details: %s", namespace, name, drivers.ErrJobFailed) - return utils.ToJobStatus(0, errMsg), nil + return utils.ToJobStatus(0, errMsg, restartCount), nil } if utils.IsJobCompleted(job) { - return utils.ToJobStatus(drivers.TransferProgressCompleted, ""), nil + return utils.ToJobStatus(drivers.TransferProgressCompleted, "", restartCount), nil } - return utils.ToJobStatus(0, ""), nil + return utils.ToJobStatus(0, "", restartCount), nil } func (d Driver) validate(o drivers.JobOpts) error { @@ -176,6 +184,7 @@ func jobFor( Labels: labels, }, Spec: batchv1.JobSpec{ + BackoffLimit: &utils.JobPodBackOffLimit, Template: corev1.PodTemplateSpec{ ObjectMeta: metav1.ObjectMeta{ Labels: labels, diff --git a/vendor/github.com/portworx/kdmp/pkg/drivers/kopiarestore/kopiarestore.go b/vendor/github.com/portworx/kdmp/pkg/drivers/kopiarestore/kopiarestore.go index f9bfa356df..aa5eb01d9c 100644 --- a/vendor/github.com/portworx/kdmp/pkg/drivers/kopiarestore/kopiarestore.go +++ b/vendor/github.com/portworx/kdmp/pkg/drivers/kopiarestore/kopiarestore.go @@ -102,30 +102,38 @@ func (d Driver) DeleteJob(id string) error { // JobStatus returns a progress status for a data transfer. func (d Driver) JobStatus(id string) (*drivers.JobStatus, error) { + fn := "JobStatus:" namespace, name, err := utils.ParseJobID(id) if err != nil { - return utils.ToJobStatus(0, err.Error()), nil + return utils.ToJobStatus(0, err.Error(), 0), nil } job, err := batch.Instance().GetJob(name, namespace) if err != nil { return nil, err } + restartCount, err := utils.FetchJobContainerRestartCount(job) + if err != nil { + errMsg := fmt.Sprintf("failed to get restart count for job %s/%s job: %v", namespace, name, err) + logrus.Errorf("%s: %v", fn, errMsg) + return nil, fmt.Errorf(errMsg) + } + if utils.IsJobFailed(job) { errMsg := fmt.Sprintf("check %s/%s job for details: %s", namespace, name, drivers.ErrJobFailed) - return utils.ToJobStatus(0, errMsg), nil + return utils.ToJobStatus(0, errMsg, restartCount), nil } if utils.IsJobPending(job) { logrus.Warnf("restore job %s is in pending state", job.Name) - return utils.ToJobStatus(0, ""), nil + return utils.ToJobStatus(0, "", restartCount), nil } if !utils.IsJobCompleted(job) { // TODO: update progress - return utils.ToJobStatus(0, ""), nil + return utils.ToJobStatus(0, "", restartCount), nil } - return utils.ToJobStatus(drivers.TransferProgressCompleted, ""), nil + return utils.ToJobStatus(drivers.TransferProgressCompleted, "", restartCount), nil } func (d Driver) validate(o drivers.JobOpts) error { @@ -188,6 +196,7 @@ func jobFor( Labels: labels, }, Spec: batchv1.JobSpec{ + BackoffLimit: &utils.JobPodBackOffLimit, Template: corev1.PodTemplateSpec{ ObjectMeta: metav1.ObjectMeta{ Labels: labels, diff --git a/vendor/github.com/portworx/kdmp/pkg/drivers/resticbackup/resticbackup.go b/vendor/github.com/portworx/kdmp/pkg/drivers/resticbackup/resticbackup.go index a5808c51e8..2a00086ea8 100644 --- a/vendor/github.com/portworx/kdmp/pkg/drivers/resticbackup/resticbackup.go +++ b/vendor/github.com/portworx/kdmp/pkg/drivers/resticbackup/resticbackup.go @@ -92,16 +92,22 @@ func (d Driver) DeleteJob(id string) error { func (d Driver) JobStatus(id string) (*drivers.JobStatus, error) { namespace, name, err := utils.ParseJobID(id) if err != nil { - return utils.ToJobStatus(0, err.Error()), nil + return utils.ToJobStatus(0, err.Error(), 0), nil } job, err := batch.Instance().GetJob(name, namespace) if err != nil { return nil, err } + restartCount, err := utils.FetchJobContainerRestartCount(job) + if err != nil { + errMsg := fmt.Sprintf("failed to get restart count for job %s/%s job: %v", namespace, name, err) + return nil, fmt.Errorf(errMsg) + } + if utils.IsJobFailed(job) { errMsg := fmt.Sprintf("check %s/%s job for details: %s", namespace, name, drivers.ErrJobFailed) - return utils.ToJobStatus(0, errMsg), nil + return utils.ToJobStatus(0, errMsg, restartCount), nil } // restic executor updates a volumebackup object with a progress details @@ -110,7 +116,7 @@ func (d Driver) JobStatus(id string) (*drivers.JobStatus, error) { return nil, err } - return utils.ToJobStatus(vb.Status.ProgressPercentage, vb.Status.LastKnownError), nil + return utils.ToJobStatus(vb.Status.ProgressPercentage, vb.Status.LastKnownError, restartCount), nil } func (d Driver) validate(o drivers.JobOpts) error { @@ -159,6 +165,7 @@ func jobFor( Labels: labels, }, Spec: batchv1.JobSpec{ + BackoffLimit: &utils.JobPodBackOffLimit, Template: corev1.PodTemplateSpec{ ObjectMeta: metav1.ObjectMeta{ Labels: labels, diff --git a/vendor/github.com/portworx/kdmp/pkg/drivers/resticbackup/resticbackuplive.go b/vendor/github.com/portworx/kdmp/pkg/drivers/resticbackup/resticbackuplive.go index b206719100..fc0f12e771 100644 --- a/vendor/github.com/portworx/kdmp/pkg/drivers/resticbackup/resticbackuplive.go +++ b/vendor/github.com/portworx/kdmp/pkg/drivers/resticbackup/resticbackuplive.go @@ -65,6 +65,7 @@ func jobForLiveBackup( Labels: labels, }, Spec: batchv1.JobSpec{ + BackoffLimit: &utils.JobPodBackOffLimit, Template: corev1.PodTemplateSpec{ ObjectMeta: metav1.ObjectMeta{ Labels: labels, diff --git a/vendor/github.com/portworx/kdmp/pkg/drivers/resticrestore/resticrestore.go b/vendor/github.com/portworx/kdmp/pkg/drivers/resticrestore/resticrestore.go index 2985752049..42e1c9a2a7 100644 --- a/vendor/github.com/portworx/kdmp/pkg/drivers/resticrestore/resticrestore.go +++ b/vendor/github.com/portworx/kdmp/pkg/drivers/resticrestore/resticrestore.go @@ -103,23 +103,30 @@ func (d Driver) DeleteJob(id string) error { func (d Driver) JobStatus(id string) (*drivers.JobStatus, error) { namespace, name, err := utils.ParseJobID(id) if err != nil { - return utils.ToJobStatus(0, err.Error()), nil + return utils.ToJobStatus(0, err.Error(), 0), nil } job, err := batch.Instance().GetJob(name, namespace) if err != nil { return nil, err } + + restartCount, err := utils.FetchJobContainerRestartCount(job) + if err != nil { + errMsg := fmt.Sprintf("failed to rget estart count for job %s/%s job: %v", namespace, name, err) + return nil, fmt.Errorf(errMsg) + } + if utils.IsJobFailed(job) { errMsg := fmt.Sprintf("check %s/%s job for details: %s", namespace, name, drivers.ErrJobFailed) - return utils.ToJobStatus(0, errMsg), nil + return utils.ToJobStatus(0, errMsg, restartCount), nil } if !utils.IsJobCompleted(job) { // TODO: update progress - return utils.ToJobStatus(0, ""), nil + return utils.ToJobStatus(0, "", restartCount), nil } - return utils.ToJobStatus(drivers.TransferProgressCompleted, ""), nil + return utils.ToJobStatus(drivers.TransferProgressCompleted, "", restartCount), nil } func (d Driver) validate(o drivers.JobOpts) error { @@ -174,6 +181,7 @@ func jobFor( Labels: labels, }, Spec: batchv1.JobSpec{ + BackoffLimit: &utils.JobPodBackOffLimit, Template: corev1.PodTemplateSpec{ ObjectMeta: metav1.ObjectMeta{ Labels: labels, diff --git a/vendor/github.com/portworx/kdmp/pkg/drivers/rsync/rsync.go b/vendor/github.com/portworx/kdmp/pkg/drivers/rsync/rsync.go index eff27d7820..4f9891fb2c 100644 --- a/vendor/github.com/portworx/kdmp/pkg/drivers/rsync/rsync.go +++ b/vendor/github.com/portworx/kdmp/pkg/drivers/rsync/rsync.go @@ -70,24 +70,31 @@ func (d Driver) DeleteJob(id string) error { func (d Driver) JobStatus(id string) (*drivers.JobStatus, error) { namespace, name, err := utils.ParseJobID(id) if err != nil { - return utils.ToJobStatus(0, err.Error()), nil + return utils.ToJobStatus(0, err.Error(), 0), nil } job, err := batch.Instance().GetJob(name, namespace) if err != nil { return nil, err } + + restartCount, err := utils.FetchJobContainerRestartCount(job) + if err != nil { + errMsg := fmt.Sprintf("failed to get restart count for job %s/%s job: %v", namespace, name, err) + return nil, fmt.Errorf(errMsg) + } + if utils.IsJobFailed(job) { errMsg := fmt.Sprintf("check %s/%s job for details: %s", namespace, name, drivers.ErrJobFailed) - return utils.ToJobStatus(0, errMsg), nil + return utils.ToJobStatus(0, errMsg, restartCount), nil } if !utils.IsJobCompleted(job) { // TODO: update progress - return utils.ToJobStatus(0, ""), nil + return utils.ToJobStatus(0, "", restartCount), nil } - return utils.ToJobStatus(drivers.TransferProgressCompleted, ""), nil + return utils.ToJobStatus(drivers.TransferProgressCompleted, "", 0), nil } func (d Driver) validate(o drivers.JobOpts) error { @@ -123,6 +130,7 @@ func jobFor(srcVol, dstVol, namespace string, labels map[string]string) (*batchv Labels: labels, }, Spec: batchv1.JobSpec{ + BackoffLimit: &utils.JobPodBackOffLimit, Template: corev1.PodTemplateSpec{ ObjectMeta: metav1.ObjectMeta{ Labels: labels, diff --git a/vendor/github.com/portworx/kdmp/pkg/drivers/utils/common.go b/vendor/github.com/portworx/kdmp/pkg/drivers/utils/common.go index 721737d0e8..9c3fbf24c4 100644 --- a/vendor/github.com/portworx/kdmp/pkg/drivers/utils/common.go +++ b/vendor/github.com/portworx/kdmp/pkg/drivers/utils/common.go @@ -37,6 +37,11 @@ const ( DefaultCompresion = "s2-parallel-8" ) +var ( + // JobPodBackOffLimit backofflimit for the job + JobPodBackOffLimit = int32(4) +) + // SetupServiceAccount create a service account and bind it to a provided role. func SetupServiceAccount(name, namespace string, role *rbacv1.Role) error { if role != nil { diff --git a/vendor/github.com/portworx/kdmp/pkg/drivers/utils/utils.go b/vendor/github.com/portworx/kdmp/pkg/drivers/utils/utils.go index 460572221b..dc90abe271 100644 --- a/vendor/github.com/portworx/kdmp/pkg/drivers/utils/utils.go +++ b/vendor/github.com/portworx/kdmp/pkg/drivers/utils/utils.go @@ -113,12 +113,35 @@ func IsJobPending(j *batchv1.Job) bool { return false } +// FetchJobContainerRestartCount fetches job pod restart count +func FetchJobContainerRestartCount(j *batchv1.Job) (int32, error) { + // Check if the pod is in running state + pods, err := core.Instance().GetPods( + j.Namespace, + map[string]string{ + "job-name": j.Name, + }, + ) + if err != nil { + // Cannot determine job state + return 0, fmt.Errorf("cannot determine job state") + } else if len(pods.Items) == 0 { + return 0, nil + } + if len(pods.Items[0].Status.ContainerStatuses) == 0 { + return 0, nil + } + + return (pods.Items[0].Status.ContainerStatuses[0].RestartCount), nil +} + // ToJobStatus returns a job status for provided parameters. -func ToJobStatus(progress float64, errMsg string) *drivers.JobStatus { +func ToJobStatus(progress float64, errMsg string, retartCount int32) *drivers.JobStatus { if len(errMsg) > 0 { return &drivers.JobStatus{ - State: drivers.JobStateFailed, - Reason: errMsg, + State: drivers.JobStateFailed, + Reason: errMsg, + RestartCount: retartCount, } } @@ -126,12 +149,14 @@ func ToJobStatus(progress float64, errMsg string) *drivers.JobStatus { return &drivers.JobStatus{ State: drivers.JobStateCompleted, ProgressPercents: progress, + RestartCount: retartCount, } } return &drivers.JobStatus{ State: drivers.JobStateInProgress, ProgressPercents: progress, + RestartCount: retartCount, } } diff --git a/vendor/modules.txt b/vendor/modules.txt index 8973ceb12f..a7787d8061 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -425,7 +425,7 @@ github.com/pierrec/lz4/internal/xxh32 github.com/pkg/errors # github.com/pmezard/go-difflib v1.0.0 github.com/pmezard/go-difflib/difflib -# github.com/portworx/kdmp v0.4.1-0.20211117114335-623768a845fc +# github.com/portworx/kdmp v0.4.1-0.20211119130352-c09d94d769d7 ## explicit github.com/portworx/kdmp/pkg/apis/kdmp github.com/portworx/kdmp/pkg/apis/kdmp/v1alpha1