Skip to content

Commit

Permalink
add diagnostic for data mover exposer
Browse files Browse the repository at this point in the history
Signed-off-by: Lyndon-Li <[email protected]>
  • Loading branch information
Lyndon-Li committed Dec 10, 2024
1 parent b607259 commit 34e417b
Show file tree
Hide file tree
Showing 7 changed files with 56 additions and 59 deletions.
2 changes: 1 addition & 1 deletion changelogs/unreleased/8482-Lyndon-Li
Original file line number Diff line number Diff line change
@@ -1 +1 @@
Fix issue #8125, add diagnostic info for data mover exposers when expose times out
Fix issue #8125, log diagnostic info for data mover exposers when expose times out
6 changes: 5 additions & 1 deletion pkg/controller/data_download_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ package controller
import (
"context"
"fmt"
"strings"
"time"

"github.com/pkg/errors"
Expand Down Expand Up @@ -677,7 +678,10 @@ func (r *DataDownloadReconciler) onPrepareTimeout(ctx context.Context, dd *veler
return
}

log.Warn(r.restoreExposer.DiagnoseExpose(ctx, getDataDownloadOwnerObject(dd)))
diags := strings.Split(r.restoreExposer.DiagnoseExpose(ctx, getDataDownloadOwnerObject(dd)), "\n")
for _, diag := range diags {
log.Warnf("[Diagnose DD expose]%s", diag)
}

r.restoreExposer.CleanUp(ctx, getDataDownloadOwnerObject(dd))

Expand Down
6 changes: 5 additions & 1 deletion pkg/controller/data_upload_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ package controller
import (
"context"
"fmt"
"strings"
"time"

snapshotter "github.com/kubernetes-csi/external-snapshotter/client/v7/clientset/versioned/typed/volumesnapshot/v1"
Expand Down Expand Up @@ -755,7 +756,10 @@ func (r *DataUploadReconciler) onPrepareTimeout(ctx context.Context, du *velerov
volumeSnapshotName = du.Spec.CSISnapshot.VolumeSnapshot
}

log.Warn(ep.DiagnoseExpose(ctx, getOwnerObject(du)))
diags := strings.Split(ep.DiagnoseExpose(ctx, getOwnerObject(du)), "\n")
for _, diag := range diags {
log.Warnf("[Diagnose DU expose]%s", diag)
}

ep.CleanUp(ctx, getOwnerObject(du), volumeSnapshotName, du.Spec.SourceNamespace)

Expand Down
9 changes: 6 additions & 3 deletions pkg/exposer/csi_snapshot.go
Original file line number Diff line number Diff line change
Expand Up @@ -313,20 +313,23 @@ func (e *csiSnapshotExposer) DiagnoseExpose(ctx context.Context, ownerObject cor
backupPVCName := ownerObject.Name
backupVSName := ownerObject.Name

diag := fmt.Sprintf("***************************begin diagnose CSI exposer[%s/%s]***************************\n", ownerObject.Namespace, ownerObject.Name)
diag := "begin diagnose CSI exposer\n"

pod, err := e.kubeClient.CoreV1().Pods(ownerObject.Namespace).Get(ctx, backupPodName, metav1.GetOptions{})
if err != nil {
pod = nil
diag += fmt.Sprintf("error getting backup pod %s, err: %v\n", backupPodName, err)
}

pvc, err := e.kubeClient.CoreV1().PersistentVolumeClaims(ownerObject.Namespace).Get(ctx, backupPVCName, metav1.GetOptions{})
if err != nil {
pvc = nil
diag += fmt.Sprintf("error getting backup pvc %s, err: %v\n", backupPVCName, err)
}

vs, err := e.csiSnapshotClient.VolumeSnapshots(ownerObject.Namespace).Get(ctx, backupVSName, metav1.GetOptions{})
if err != nil {
vs = nil
diag += fmt.Sprintf("error getting backup vs %s, err: %v\n", backupVSName, err)
}

Expand All @@ -335,7 +338,7 @@ func (e *csiSnapshotExposer) DiagnoseExpose(ctx context.Context, ownerObject cor

if pod.Spec.NodeName != "" {
if err := nodeagent.KbClientIsRunningInNode(ctx, ownerObject.Namespace, pod.Spec.NodeName, e.kubeClient); err != nil {
diag += fmt.Sprintf("node-agent is not running in node %s\n", pod.Spec.NodeName)
diag += fmt.Sprintf("node-agent is not running in node %s, err: %v\n", pod.Spec.NodeName, err)
}
}
}
Expand Down Expand Up @@ -364,7 +367,7 @@ func (e *csiSnapshotExposer) DiagnoseExpose(ctx context.Context, ownerObject cor
}
}

diag += fmt.Sprintf("***************************end diagnose CSI exposer[%s/%s]***************************\n", ownerObject.Namespace, ownerObject.Name)
diag += "end diagnose CSI exposer"

return diag
}
Expand Down
47 changes: 19 additions & 28 deletions pkg/exposer/csi_snapshot_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1166,12 +1166,11 @@ func Test_csiSnapshotExposer_DiagnoseExpose(t *testing.T) {
{
name: "no pod, pvc, vs",
ownerBackup: backup,
expected: `***************************begin diagnose CSI exposer[velero/fake-backup]***************************
expected: `begin diagnose CSI exposer
error getting backup pod fake-backup, err: pods "fake-backup" not found
error getting backup pvc fake-backup, err: persistentvolumeclaims "fake-backup" not found
error getting backup vs fake-backup, err: volumesnapshots.snapshot.storage.k8s.io "fake-backup" not found
***************************end diagnose CSI exposer[velero/fake-backup]***************************
`,
end diagnose CSI exposer`,
},
{
name: "pod without node name, pvc without volume name, vs without status",
Expand All @@ -1183,13 +1182,12 @@ error getting backup vs fake-backup, err: volumesnapshots.snapshot.storage.k8s.i
snapshotClientObj: []runtime.Object{
&backupVSWithoutStatus,
},
expected: `***************************begin diagnose CSI exposer[velero/fake-backup]***************************
expected: `begin diagnose CSI exposer
Pod velero/fake-backup, phase Pending, node name
Pod condition Initialized, status True, reason , message fake-pod-message
PVC velero/fake-backup, phase Pending, binding to
VS velero/fake-backup, bind to , readyToUse false, errMessage
***************************end diagnose CSI exposer[velero/fake-backup]***************************
`,
end diagnose CSI exposer`,
},
{
name: "pod without node name, pvc without volume name, vs without VSC",
Expand All @@ -1201,13 +1199,12 @@ VS velero/fake-backup, bind to , readyToUse false, errMessage
snapshotClientObj: []runtime.Object{
&backupVSWithoutVSC,
},
expected: `***************************begin diagnose CSI exposer[velero/fake-backup]***************************
expected: `begin diagnose CSI exposer
Pod velero/fake-backup, phase Pending, node name
Pod condition Initialized, status True, reason , message fake-pod-message
PVC velero/fake-backup, phase Pending, binding to
VS velero/fake-backup, bind to , readyToUse false, errMessage
***************************end diagnose CSI exposer[velero/fake-backup]***************************
`,
end diagnose CSI exposer`,
},
{
name: "pod with node name, no node agent",
Expand All @@ -1219,14 +1216,13 @@ VS velero/fake-backup, bind to , readyToUse false, errMessage
snapshotClientObj: []runtime.Object{
&backupVSWithoutVSC,
},
expected: `***************************begin diagnose CSI exposer[velero/fake-backup]***************************
expected: `begin diagnose CSI exposer
Pod velero/fake-backup, phase Pending, node name fake-node
Pod condition Initialized, status True, reason , message fake-pod-message
node-agent is not running in node fake-node
node-agent is not running in node fake-node, err: daemonset pod not found in running state in node fake-node
PVC velero/fake-backup, phase Pending, binding to
VS velero/fake-backup, bind to , readyToUse false, errMessage
***************************end diagnose CSI exposer[velero/fake-backup]***************************
`,
end diagnose CSI exposer`,
},
{
name: "pod with node name, node agent is running",
Expand All @@ -1239,13 +1235,12 @@ VS velero/fake-backup, bind to , readyToUse false, errMessage
snapshotClientObj: []runtime.Object{
&backupVSWithoutVSC,
},
expected: `***************************begin diagnose CSI exposer[velero/fake-backup]***************************
expected: `begin diagnose CSI exposer
Pod velero/fake-backup, phase Pending, node name fake-node
Pod condition Initialized, status True, reason , message fake-pod-message
PVC velero/fake-backup, phase Pending, binding to
VS velero/fake-backup, bind to , readyToUse false, errMessage
***************************end diagnose CSI exposer[velero/fake-backup]***************************
`,
end diagnose CSI exposer`,
},
{
name: "pvc with volume name, no pv",
Expand All @@ -1258,14 +1253,13 @@ VS velero/fake-backup, bind to , readyToUse false, errMessage
snapshotClientObj: []runtime.Object{
&backupVSWithoutVSC,
},
expected: `***************************begin diagnose CSI exposer[velero/fake-backup]***************************
expected: `begin diagnose CSI exposer
Pod velero/fake-backup, phase Pending, node name fake-node
Pod condition Initialized, status True, reason , message fake-pod-message
PVC velero/fake-backup, phase Pending, binding to fake-pv
error getting backup pv fake-pv, err: persistentvolumes "fake-pv" not found
VS velero/fake-backup, bind to , readyToUse false, errMessage
***************************end diagnose CSI exposer[velero/fake-backup]***************************
`,
end diagnose CSI exposer`,
},
{
name: "pvc with volume name, pv exists",
Expand All @@ -1279,14 +1273,13 @@ VS velero/fake-backup, bind to , readyToUse false, errMessage
snapshotClientObj: []runtime.Object{
&backupVSWithoutVSC,
},
expected: `***************************begin diagnose CSI exposer[velero/fake-backup]***************************
expected: `begin diagnose CSI exposer
Pod velero/fake-backup, phase Pending, node name fake-node
Pod condition Initialized, status True, reason , message fake-pod-message
PVC velero/fake-backup, phase Pending, binding to fake-pv
PV fake-pv, phase Pending, reason , message fake-pv-message
VS velero/fake-backup, bind to , readyToUse false, errMessage
***************************end diagnose CSI exposer[velero/fake-backup]***************************
`,
end diagnose CSI exposer`,
},
{
name: "vs with vsc, vsc doesn't exist",
Expand All @@ -1300,15 +1293,14 @@ VS velero/fake-backup, bind to , readyToUse false, errMessage
snapshotClientObj: []runtime.Object{
&backupVSWithVSC,
},
expected: `***************************begin diagnose CSI exposer[velero/fake-backup]***************************
expected: `begin diagnose CSI exposer
Pod velero/fake-backup, phase Pending, node name fake-node
Pod condition Initialized, status True, reason , message fake-pod-message
PVC velero/fake-backup, phase Pending, binding to fake-pv
PV fake-pv, phase Pending, reason , message fake-pv-message
VS velero/fake-backup, bind to fake-vsc, readyToUse false, errMessage fake-vs-message
error getting backup vsc fake-vsc, err: volumesnapshotcontents.snapshot.storage.k8s.io "fake-vsc" not found
***************************end diagnose CSI exposer[velero/fake-backup]***************************
`,
end diagnose CSI exposer`,
},
{
name: "vs with vsc, vsc exists",
Expand All @@ -1323,15 +1315,14 @@ error getting backup vsc fake-vsc, err: volumesnapshotcontents.snapshot.storage.
&backupVSWithVSC,
&backupVSC,
},
expected: `***************************begin diagnose CSI exposer[velero/fake-backup]***************************
expected: `begin diagnose CSI exposer
Pod velero/fake-backup, phase Pending, node name fake-node
Pod condition Initialized, status True, reason , message fake-pod-message
PVC velero/fake-backup, phase Pending, binding to fake-pv
PV fake-pv, phase Pending, reason , message fake-pv-message
VS velero/fake-backup, bind to fake-vsc, readyToUse false, errMessage fake-vs-message
VSC fake-vsc, readyToUse false, errMessage fake-vsc-message, handle
***************************end diagnose CSI exposer[velero/fake-backup]***************************
`,
end diagnose CSI exposer`,
},
}
for _, tt := range tests {
Expand Down
8 changes: 5 additions & 3 deletions pkg/exposer/generic_restore.go
Original file line number Diff line number Diff line change
Expand Up @@ -204,15 +204,17 @@ func (e *genericRestoreExposer) DiagnoseExpose(ctx context.Context, ownerObject
restorePodName := ownerObject.Name
restorePVCName := ownerObject.Name

diag := fmt.Sprintf("***************************begin diagnose restore exposer[%s/%s]***************************\n", ownerObject.Namespace, ownerObject.Name)
diag := "begin diagnose restore exposer\n"

pod, err := e.kubeClient.CoreV1().Pods(ownerObject.Namespace).Get(ctx, restorePodName, metav1.GetOptions{})
if err != nil {
pod = nil
diag += fmt.Sprintf("error getting restore pod %s, err: %v\n", restorePodName, err)
}

pvc, err := e.kubeClient.CoreV1().PersistentVolumeClaims(ownerObject.Namespace).Get(ctx, restorePVCName, metav1.GetOptions{})
if err != nil {
pvc = nil
diag += fmt.Sprintf("error getting restore pvc %s, err: %v\n", restorePVCName, err)
}

Expand All @@ -221,7 +223,7 @@ func (e *genericRestoreExposer) DiagnoseExpose(ctx context.Context, ownerObject

if pod.Spec.NodeName != "" {
if err := nodeagent.KbClientIsRunningInNode(ctx, ownerObject.Namespace, pod.Spec.NodeName, e.kubeClient); err != nil {
diag += fmt.Sprintf("node-agent is not running in node %s\n", pod.Spec.NodeName)
diag += fmt.Sprintf("node-agent is not running in node %s, err: %v\n", pod.Spec.NodeName, err)
}
}
}
Expand All @@ -238,7 +240,7 @@ func (e *genericRestoreExposer) DiagnoseExpose(ctx context.Context, ownerObject
}
}

diag += fmt.Sprintf("***************************end diagnose restore exposer[%s/%s]***************************\n", ownerObject.Namespace, ownerObject.Name)
diag += "end diagnose restore exposer"

return diag
}
Expand Down
37 changes: 15 additions & 22 deletions pkg/exposer/generic_restore_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -646,11 +646,10 @@ func Test_ReastoreDiagnoseExpose(t *testing.T) {
{
name: "no pod, pvc",
ownerRestore: restore,
expected: `***************************begin diagnose restore exposer[velero/fake-restore]***************************
expected: `begin diagnose restore exposer
error getting restore pod fake-restore, err: pods "fake-restore" not found
error getting restore pvc fake-restore, err: persistentvolumeclaims "fake-restore" not found
***************************end diagnose restore exposer[velero/fake-restore]***************************
`,
end diagnose restore exposer`,
},
{
name: "pod without node name, pvc without volume name, vs without status",
Expand All @@ -659,12 +658,11 @@ error getting restore pvc fake-restore, err: persistentvolumeclaims "fake-restor
&restorePodWithoutNodeName,
&restorePVCWithoutVolumeName,
},
expected: `***************************begin diagnose restore exposer[velero/fake-restore]***************************
expected: `begin diagnose restore exposer
Pod velero/fake-restore, phase Pending, node name
Pod condition Initialized, status True, reason , message fake-pod-message
PVC velero/fake-restore, phase Pending, binding to
***************************end diagnose restore exposer[velero/fake-restore]***************************
`,
end diagnose restore exposer`,
},
{
name: "pod without node name, pvc without volume name",
Expand All @@ -673,12 +671,11 @@ PVC velero/fake-restore, phase Pending, binding to
&restorePodWithoutNodeName,
&restorePVCWithoutVolumeName,
},
expected: `***************************begin diagnose restore exposer[velero/fake-restore]***************************
expected: `begin diagnose restore exposer
Pod velero/fake-restore, phase Pending, node name
Pod condition Initialized, status True, reason , message fake-pod-message
PVC velero/fake-restore, phase Pending, binding to
***************************end diagnose restore exposer[velero/fake-restore]***************************
`,
end diagnose restore exposer`,
},
{
name: "pod with node name, no node agent",
Expand All @@ -687,13 +684,12 @@ PVC velero/fake-restore, phase Pending, binding to
&restorePodWithNodeName,
&restorePVCWithoutVolumeName,
},
expected: `***************************begin diagnose restore exposer[velero/fake-restore]***************************
expected: `begin diagnose restore exposer
Pod velero/fake-restore, phase Pending, node name fake-node
Pod condition Initialized, status True, reason , message fake-pod-message
node-agent is not running in node fake-node
node-agent is not running in node fake-node, err: daemonset pod not found in running state in node fake-node
PVC velero/fake-restore, phase Pending, binding to
***************************end diagnose restore exposer[velero/fake-restore]***************************
`,
end diagnose restore exposer`,
},
{
name: "pod with node name, node agent is running",
Expand All @@ -703,12 +699,11 @@ PVC velero/fake-restore, phase Pending, binding to
&restorePVCWithoutVolumeName,
&nodeAgentPod,
},
expected: `***************************begin diagnose restore exposer[velero/fake-restore]***************************
expected: `begin diagnose restore exposer
Pod velero/fake-restore, phase Pending, node name fake-node
Pod condition Initialized, status True, reason , message fake-pod-message
PVC velero/fake-restore, phase Pending, binding to
***************************end diagnose restore exposer[velero/fake-restore]***************************
`,
end diagnose restore exposer`,
},
{
name: "pvc with volume name, no pv",
Expand All @@ -718,13 +713,12 @@ PVC velero/fake-restore, phase Pending, binding to
&restorePVCWithVolumeName,
&nodeAgentPod,
},
expected: `***************************begin diagnose restore exposer[velero/fake-restore]***************************
expected: `begin diagnose restore exposer
Pod velero/fake-restore, phase Pending, node name fake-node
Pod condition Initialized, status True, reason , message fake-pod-message
PVC velero/fake-restore, phase Pending, binding to fake-pv
error getting restore pv fake-pv, err: persistentvolumes "fake-pv" not found
***************************end diagnose restore exposer[velero/fake-restore]***************************
`,
end diagnose restore exposer`,
},
{
name: "pvc with volume name, pv exists",
Expand All @@ -735,13 +729,12 @@ error getting restore pv fake-pv, err: persistentvolumes "fake-pv" not found
&restorePV,
&nodeAgentPod,
},
expected: `***************************begin diagnose restore exposer[velero/fake-restore]***************************
expected: `begin diagnose restore exposer
Pod velero/fake-restore, phase Pending, node name fake-node
Pod condition Initialized, status True, reason , message fake-pod-message
PVC velero/fake-restore, phase Pending, binding to fake-pv
PV fake-pv, phase Pending, reason , message fake-pv-message
***************************end diagnose restore exposer[velero/fake-restore]***************************
`,
end diagnose restore exposer`,
},
}
for _, test := range tests {
Expand Down

0 comments on commit 34e417b

Please sign in to comment.