Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove createSnapshotContentRetryCount and createSnapshotContentInterval #211

Merged
merged 4 commits into from
Dec 20, 2019
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 4 additions & 8 deletions cmd/snapshot-controller/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,11 +46,9 @@ const (

// Command line flags
var (
kubeconfig = flag.String("kubeconfig", "", "Absolute path to the kubeconfig file. Required only when running out of cluster.")
createSnapshotContentRetryCount = flag.Int("create-snapshotcontent-retrycount", 5, "Number of retries when we create a snapshot content object for a snapshot.")
createSnapshotContentInterval = flag.Duration("create-snapshotcontent-interval", 10*time.Second, "Interval between retries when we create a snapshot content object for a snapshot.")
resyncPeriod = flag.Duration("resync-period", 60*time.Second, "Resync interval of the controller.")
showVersion = flag.Bool("version", false, "Show version.")
kubeconfig = flag.String("kubeconfig", "", "Absolute path to the kubeconfig file. Required only when running out of cluster.")
resyncPeriod = flag.Duration("resync-period", 60*time.Second, "Resync interval of the controller.")
showVersion = flag.Bool("version", false, "Show version.")

leaderElection = flag.Bool("leader-election", false, "Enables leader election.")
leaderElectionNamespace = flag.String("leader-election-namespace", "", "The namespace where the leader election resource exists. Defaults to the pod namespace if not set.")
Expand Down Expand Up @@ -96,7 +94,7 @@ func main() {
// Add Snapshot types to the defualt Kubernetes so events can be logged for them
snapshotscheme.AddToScheme(scheme.Scheme)

klog.V(2).Infof("Start NewCSISnapshotController with kubeconfig [%s] createSnapshotContentRetryCount [%d] createSnapshotContentInterval [%d] resyncPeriod [%+v]", *kubeconfig, *createSnapshotContentRetryCount, *createSnapshotContentInterval, *resyncPeriod)
klog.V(2).Infof("Start NewCSISnapshotController with kubeconfig [%s] resyncPeriod [%+v]", *kubeconfig, *resyncPeriod)

ctrl := controller.NewCSISnapshotCommonController(
snapClient,
Expand All @@ -105,8 +103,6 @@ func main() {
factory.Snapshot().V1beta1().VolumeSnapshotContents(),
factory.Snapshot().V1beta1().VolumeSnapshotClasses(),
coreFactory.Core().V1().PersistentVolumeClaims(),
*createSnapshotContentRetryCount,
*createSnapshotContentInterval,
*resyncPeriod,
)

Expand Down
2 changes: 0 additions & 2 deletions pkg/common-controller/framework_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -761,8 +761,6 @@ func newTestController(kubeClient kubernetes.Interface, clientset clientset.Inte
informerFactory.Snapshot().V1beta1().VolumeSnapshotContents(),
informerFactory.Snapshot().V1beta1().VolumeSnapshotClasses(),
coreFactory.Core().V1().PersistentVolumeClaims(),
3,
5*time.Millisecond,
60*time.Second,
)

Expand Down
63 changes: 21 additions & 42 deletions pkg/common-controller/snapshot_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -337,16 +337,10 @@ func (ctrl *csiSnapshotCommonController) syncUnreadySnapshot(snapshot *crdv1.Vol
}

// update snapshot status
for i := 0; i < ctrl.createSnapshotContentRetryCount; i++ {
klog.V(5).Infof("syncUnreadySnapshot [%s]: trying to update snapshot status", utils.SnapshotKey(snapshot))
_, err = ctrl.updateSnapshotStatus(snapshot, newContent)
if err == nil {
break
}
klog.V(4).Infof("failed to update snapshot %s status: %v", utils.SnapshotKey(snapshot), err)
}

klog.V(5).Infof("syncUnreadySnapshot [%s]: trying to update snapshot status", utils.SnapshotKey(snapshot))
_, err = ctrl.updateSnapshotStatus(snapshot, newContent)
if err != nil {
klog.V(4).Infof("failed to update snapshot %s status: %v", utils.SnapshotKey(snapshot), err)
// update snapshot status failed
ctrl.updateSnapshotErrorStatusWithEvent(snapshot, v1.EventTypeWarning, "SnapshotStatusUpdateFailed", fmt.Sprintf("Snapshot status update failed, %v", err))
return err
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looking at the caller, I see that we don't requeue the object and instead rely on resync period to retry. I think we should change the behavior to requeue, but we should also make sure the workqueue has proper throttling/backoff as well.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Use queue.AddRateLimited(key)?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you take a look of this? 10017c2

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I like this direction. Also cc @yuxiangqian and @jsafrane for more eyes. This will cause snapshotter to potentially retry more frequently than it has in the past, although it should be rate limited.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder if we should delay this change. This changes snapshot controller behavior a lot. In the past we don't do retries when create snapshot fails because snapshot is time sensitive (other than a few retries when creating/updating API objects).

Can we revisit this after the 2.0 release? We have a pending fix for the create snapshot timeout issue as well that we should go back to after 2.0.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A little concerned about removing retrying logic for status update and snapshot content creation from application perspective. Do we have a rough idea how often retry would resolve API server failures? Application could end up based of the existence of status field of a snapshot to decide whether or not to unquiesce. With RateLimited requeue, the waiting time could be significantly longer?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Removed snapshotQueue.AddRateLimited().

What is conclusion here? Should I just add back createSnapshotContentRetryCount() as a internal const or is the current PR okay? @msau42

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't like having retries within retries. It makes timing hard to reason about (metrics look strange), and we can end up overloading the API server since there's two different throttling behaviors that need to be controlled. We previously had issues in external-provisioner due to it having multiple retries within retries and I'd like to avoid that here too. We can tune the rate limiting as necessary.

But I agree this is a large, fundamental change to how the snapshot controller has operated, so I'm ok deferring it to the next release.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, let's add the retry count back for now so the only change is not making the retry count configurable.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure.

Expand Down Expand Up @@ -399,16 +393,10 @@ func (ctrl *csiSnapshotCommonController) syncUnreadySnapshot(snapshot *crdv1.Vol
}

// Update snapshot status with BoundVolumeSnapshotContentName
for i := 0; i < ctrl.createSnapshotContentRetryCount; i++ {
klog.V(5).Infof("syncUnreadySnapshot [%s]: trying to update snapshot status", utils.SnapshotKey(snapshot))
_, err = ctrl.updateSnapshotStatus(snapshot, content)
if err == nil {
break
}
klog.V(4).Infof("failed to update snapshot %s status: %v", utils.SnapshotKey(snapshot), err)
}

klog.V(5).Infof("syncUnreadySnapshot [%s]: trying to update snapshot status", utils.SnapshotKey(snapshot))
_, err = ctrl.updateSnapshotStatus(snapshot, content)
if err != nil {
klog.V(4).Infof("failed to update snapshot %s status: %v", utils.SnapshotKey(snapshot), err)
// update snapshot status failed
ctrl.updateSnapshotErrorStatusWithEvent(snapshot, v1.EventTypeWarning, "SnapshotStatusUpdateFailed", fmt.Sprintf("Snapshot status update failed, %v", err))
return err
Expand Down Expand Up @@ -501,23 +489,17 @@ func (ctrl *csiSnapshotCommonController) createSnapshotContent(snapshot *crdv1.V

var updateContent *crdv1.VolumeSnapshotContent
klog.V(3).Infof("volume snapshot content %#v", snapshotContent)
// Try to create the VolumeSnapshotContent object several times
for i := 0; i < ctrl.createSnapshotContentRetryCount; i++ {
klog.V(5).Infof("createSnapshotContent [%s]: trying to save volume snapshot content %s", utils.SnapshotKey(snapshot), snapshotContent.Name)
if updateContent, err = ctrl.clientset.SnapshotV1beta1().VolumeSnapshotContents().Create(snapshotContent); err == nil || apierrs.IsAlreadyExists(err) {
// Save succeeded.
if err != nil {
klog.V(3).Infof("volume snapshot content %q for snapshot %q already exists, reusing", snapshotContent.Name, utils.SnapshotKey(snapshot))
err = nil
updateContent = snapshotContent
} else {
klog.V(3).Infof("volume snapshot content %q for snapshot %q saved, %v", snapshotContent.Name, utils.SnapshotKey(snapshot), snapshotContent)
}
break
// Try to create the VolumeSnapshotContent object
klog.V(5).Infof("createSnapshotContent [%s]: trying to save volume snapshot content %s", utils.SnapshotKey(snapshot), snapshotContent.Name)
if updateContent, err = ctrl.clientset.SnapshotV1beta1().VolumeSnapshotContents().Create(snapshotContent); err == nil || apierrs.IsAlreadyExists(err) {
// Save succeeded.
if err != nil {
klog.V(3).Infof("volume snapshot content %q for snapshot %q already exists, reusing", snapshotContent.Name, utils.SnapshotKey(snapshot))
err = nil
updateContent = snapshotContent
} else {
klog.V(3).Infof("volume snapshot content %q for snapshot %q saved, %v", snapshotContent.Name, utils.SnapshotKey(snapshot), snapshotContent)
}
// Save failed, try again after a while.
klog.V(3).Infof("failed to save volume snapshot content %q for snapshot %q: %v", snapshotContent.Name, utils.SnapshotKey(snapshot), err)
time.Sleep(ctrl.createSnapshotContentInterval)
}

if err != nil {
Expand Down Expand Up @@ -867,17 +849,14 @@ func (ctrl *csiSnapshotCommonController) bindandUpdateVolumeSnapshot(snapshotCon
snapshotCopy := snapshotObj.DeepCopy()

// update snapshot status
for i := 0; i < ctrl.createSnapshotContentRetryCount; i++ {
klog.V(5).Infof("bindandUpdateVolumeSnapshot [%s]: trying to update snapshot status", utils.SnapshotKey(snapshotCopy))
updateSnapshot, err := ctrl.updateSnapshotStatus(snapshotCopy, snapshotContent)
if err == nil {
snapshotCopy = updateSnapshot
break
}
klog.V(4).Infof("failed to update snapshot %s status: %v", utils.SnapshotKey(snapshot), err)
klog.V(5).Infof("bindandUpdateVolumeSnapshot [%s]: trying to update snapshot status", utils.SnapshotKey(snapshotCopy))
updateSnapshot, err := ctrl.updateSnapshotStatus(snapshotCopy, snapshotContent)
if err == nil {
snapshotCopy = updateSnapshot
}

if err != nil {
klog.V(4).Infof("failed to update snapshot %s status: %v", utils.SnapshotKey(snapshot), err)
// update snapshot status failed
ctrl.updateSnapshotErrorStatusWithEvent(snapshotCopy, v1.EventTypeWarning, "SnapshotStatusUpdateFailed", fmt.Sprintf("Snapshot status update failed, %v", err))
return nil, err
Expand Down
26 changes: 10 additions & 16 deletions pkg/common-controller/snapshot_controller_base.go
Original file line number Diff line number Diff line change
Expand Up @@ -64,9 +64,7 @@ type csiSnapshotCommonController struct {
// Map of scheduled/running operations.
runningOperations goroutinemap.GoRoutineMap

createSnapshotContentRetryCount int
createSnapshotContentInterval time.Duration
resyncPeriod time.Duration
resyncPeriod time.Duration
}

// NewCSISnapshotController returns a new *csiSnapshotCommonController
Expand All @@ -77,8 +75,6 @@ func NewCSISnapshotCommonController(
volumeSnapshotContentInformer storageinformers.VolumeSnapshotContentInformer,
volumeSnapshotClassInformer storageinformers.VolumeSnapshotClassInformer,
pvcInformer coreinformers.PersistentVolumeClaimInformer,
createSnapshotContentRetryCount int,
createSnapshotContentInterval time.Duration,
resyncPeriod time.Duration,
) *csiSnapshotCommonController {
broadcaster := record.NewBroadcaster()
Expand All @@ -88,17 +84,15 @@ func NewCSISnapshotCommonController(
eventRecorder = broadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: fmt.Sprintf("snapshot-controller")})

ctrl := &csiSnapshotCommonController{
clientset: clientset,
client: client,
eventRecorder: eventRecorder,
runningOperations: goroutinemap.NewGoRoutineMap(true),
createSnapshotContentRetryCount: createSnapshotContentRetryCount,
createSnapshotContentInterval: createSnapshotContentInterval,
resyncPeriod: resyncPeriod,
snapshotStore: cache.NewStore(cache.DeletionHandlingMetaNamespaceKeyFunc),
contentStore: cache.NewStore(cache.DeletionHandlingMetaNamespaceKeyFunc),
snapshotQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "snapshot-controller-snapshot"),
contentQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "snapshot-controller-content"),
clientset: clientset,
client: client,
eventRecorder: eventRecorder,
runningOperations: goroutinemap.NewGoRoutineMap(true),
resyncPeriod: resyncPeriod,
snapshotStore: cache.NewStore(cache.DeletionHandlingMetaNamespaceKeyFunc),
contentStore: cache.NewStore(cache.DeletionHandlingMetaNamespaceKeyFunc),
snapshotQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "snapshot-controller-snapshot"),
contentQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "snapshot-controller-content"),
}

ctrl.pvcLister = pvcInformer.Lister()
Expand Down