Skip to content

Commit

Permalink
Perf improvements for existing resource restore
Browse files Browse the repository at this point in the history
Use informer cache with dynamic client for Get calls on restore.
When the cache is enabled, also make the Get call before create.

Add server and install parameters to allow disabling this feature,
but keep it enabled by default.

Signed-off-by: Scott Seago <[email protected]>
  • Loading branch information
sseago committed Oct 11, 2023
1 parent b316101 commit 7750e12
Show file tree
Hide file tree
Showing 14 changed files with 243 additions and 34 deletions.
1 change: 1 addition & 0 deletions changelogs/unreleased/6723-sseago
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Perf improvements for existing resource restore
8 changes: 8 additions & 0 deletions pkg/client/dynamic.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,15 @@ package client

import (
"context"
"time"

metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/watch"
"k8s.io/client-go/dynamic"
"k8s.io/client-go/dynamic/dynamicinformer"
)

// DynamicFactory contains methods for retrieving dynamic clients for GroupVersionResources and
Expand All @@ -33,6 +35,8 @@ type DynamicFactory interface {
// ClientForGroupVersionResource returns a Dynamic client for the given group/version
// and resource for the given namespace.
ClientForGroupVersionResource(gv schema.GroupVersion, resource metav1.APIResource, namespace string) (Dynamic, error)
// DynamicSharedInformerFactoryForNamespace returns a DynamicSharedInformerFactory for the given namespace.
DynamicSharedInformerFactoryForNamespace(namespace string) dynamicinformer.DynamicSharedInformerFactory
}

// dynamicFactory implements DynamicFactory.
Expand All @@ -51,6 +55,10 @@ func (f *dynamicFactory) ClientForGroupVersionResource(gv schema.GroupVersion, r
}, nil
}

// DynamicSharedInformerFactoryForNamespace builds a dynamic shared informer
// factory backed by the factory's dynamic client. The returned factory is
// restricted to the given namespace, uses a one-minute default resync period,
// and applies no list-option tweaks.
func (f *dynamicFactory) DynamicSharedInformerFactoryForNamespace(namespace string) dynamicinformer.DynamicSharedInformerFactory {
	const defaultResync = time.Minute

	informerFactory := dynamicinformer.NewFilteredDynamicSharedInformerFactory(
		f.dynamicClient,
		defaultResync,
		namespace,
		nil, // no tweakListOptions function
	)
	return informerFactory
}

// Creator creates an object.
type Creator interface {
// Create creates an object.
Expand Down
4 changes: 4 additions & 0 deletions pkg/cmd/cli/install/install.go
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ type Options struct {
DefaultVolumesToFsBackup bool
UploaderType string
DefaultSnapshotMoveData bool
DisableInformerCache bool
}

// BindFlags adds command line values to the options struct.
Expand Down Expand Up @@ -122,6 +123,7 @@ func (o *Options) BindFlags(flags *pflag.FlagSet) {
flags.BoolVar(&o.DefaultVolumesToFsBackup, "default-volumes-to-fs-backup", o.DefaultVolumesToFsBackup, "Bool flag to configure Velero server to use pod volume file system backup by default for all volumes on all backups. Optional.")
flags.StringVar(&o.UploaderType, "uploader-type", o.UploaderType, fmt.Sprintf("The type of uploader to transfer the data of pod volumes, the supported values are '%s', '%s'", uploader.ResticType, uploader.KopiaType))
flags.BoolVar(&o.DefaultSnapshotMoveData, "default-snapshot-move-data", o.DefaultSnapshotMoveData, "Bool flag to configure Velero server to move data by default for all snapshots supporting data movement. Optional.")
flags.BoolVar(&o.DisableInformerCache, "disable-informer-cache", o.DisableInformerCache, "Disable informer cache for Get calls on restore. With this enabled, it will speed up restore in cases where there are backup resources which already exist in the cluster, but for very large clusters this will increase velero memory usage. Default is false (don't disable). Optional.")
}

// NewInstallOptions instantiates a new, default InstallOptions struct.
Expand Down Expand Up @@ -149,6 +151,7 @@ func NewInstallOptions() *Options {
DefaultVolumesToFsBackup: false,
UploaderType: uploader.KopiaType,
DefaultSnapshotMoveData: false,
DisableInformerCache: true,
}
}

Expand Down Expand Up @@ -213,6 +216,7 @@ func (o *Options) AsVeleroOptions() (*install.VeleroOptions, error) {
DefaultVolumesToFsBackup: o.DefaultVolumesToFsBackup,
UploaderType: o.UploaderType,
DefaultSnapshotMoveData: o.DefaultSnapshotMoveData,
DisableInformerCache: o.DisableInformerCache,
}, nil
}

Expand Down
5 changes: 5 additions & 0 deletions pkg/cmd/server/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@ const (
defaultCredentialsDirectory = "/tmp/credentials"

defaultMaxConcurrentK8SConnections = 30
defaultDisableInformerCache = false
)

type serverConfig struct {
Expand All @@ -136,6 +137,7 @@ type serverConfig struct {
uploaderType string
maxConcurrentK8SConnections int
defaultSnapshotMoveData bool
disableInformerCache bool
}

func NewCommand(f client.Factory) *cobra.Command {
Expand Down Expand Up @@ -165,6 +167,7 @@ func NewCommand(f client.Factory) *cobra.Command {
uploaderType: uploader.ResticType,
maxConcurrentK8SConnections: defaultMaxConcurrentK8SConnections,
defaultSnapshotMoveData: false,
disableInformerCache: defaultDisableInformerCache,
}
)

Expand Down Expand Up @@ -236,6 +239,7 @@ func NewCommand(f client.Factory) *cobra.Command {
command.Flags().DurationVar(&config.resourceTimeout, "resource-timeout", config.resourceTimeout, "How long to wait for resource processes which are not covered by other specific timeout parameters. Default is 10 minutes.")
command.Flags().IntVar(&config.maxConcurrentK8SConnections, "max-concurrent-k8s-connections", config.maxConcurrentK8SConnections, "Max concurrent connections number that Velero can create with kube-apiserver. Default is 30.")
command.Flags().BoolVar(&config.defaultSnapshotMoveData, "default-snapshot-move-data", config.defaultSnapshotMoveData, "Move data by default for all snapshots supporting data movement.")
command.Flags().BoolVar(&config.disableInformerCache, "disable-informer-cache", config.disableInformerCache, "Disable informer cache for Get calls on restore. WIth this enabled, it will speed up restore in cases where there are backup resources which already exist in the cluster, but for very large clusters this will increase velero memory usage. Default is false (don't disable).")

return command
}
Expand Down Expand Up @@ -936,6 +940,7 @@ func (s *server) runControllers(defaultVolumeSnapshotLocations map[string]string
s.metrics,
s.config.formatFlag.Parse(),
s.config.defaultItemOperationTimeout,
s.config.disableInformerCache,
)

if err = r.SetupWithManager(s.mgr); err != nil {
Expand Down
18 changes: 11 additions & 7 deletions pkg/controller/restore_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ type restoreReconciler struct {
logFormat logging.Format
clock clock.WithTickerAndDelayedExecution
defaultItemOperationTimeout time.Duration
disableInformerCache bool

newPluginManager func(logger logrus.FieldLogger) clientmgmt.Manager
backupStoreGetter persistence.ObjectBackupStoreGetter
Expand All @@ -123,6 +124,7 @@ func NewRestoreReconciler(
metrics *metrics.ServerMetrics,
logFormat logging.Format,
defaultItemOperationTimeout time.Duration,
disableInformerCache bool,
) *restoreReconciler {
r := &restoreReconciler{
ctx: ctx,
Expand All @@ -135,6 +137,7 @@ func NewRestoreReconciler(
logFormat: logFormat,
clock: &clock.RealClock{},
defaultItemOperationTimeout: defaultItemOperationTimeout,
disableInformerCache: disableInformerCache,

// use variables to refer to these functions so they can be
// replaced with fakes for testing.
Expand Down Expand Up @@ -519,13 +522,14 @@ func (r *restoreReconciler) runValidatedRestore(restore *api.Restore, info backu
}

restoreReq := &pkgrestore.Request{
Log: restoreLog,
Restore: restore,
Backup: info.backup,
PodVolumeBackups: podVolumeBackups,
VolumeSnapshots: volumeSnapshots,
BackupReader: backupFile,
ResourceModifiers: resourceModifiers,
Log: restoreLog,
Restore: restore,
Backup: info.backup,
PodVolumeBackups: podVolumeBackups,
VolumeSnapshots: volumeSnapshots,
BackupReader: backupFile,
ResourceModifiers: resourceModifiers,
DisableInformerCache: r.disableInformerCache,
}
restoreWarnings, restoreErrors := r.restorer.RestoreWithResolvers(restoreReq, actionsResolver, pluginManager)

Expand Down
5 changes: 5 additions & 0 deletions pkg/controller/restore_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,7 @@ func TestFetchBackupInfo(t *testing.T) {
metrics.NewServerMetrics(),
formatFlag,
60*time.Minute,
false,
)

if test.backupStoreError == nil {
Expand Down Expand Up @@ -191,6 +192,7 @@ func TestProcessQueueItemSkips(t *testing.T) {
metrics.NewServerMetrics(),
formatFlag,
60*time.Minute,
false,
)

_, err := r.Reconcile(context.Background(), ctrl.Request{NamespacedName: types.NamespacedName{
Expand Down Expand Up @@ -445,6 +447,7 @@ func TestRestoreReconcile(t *testing.T) {
metrics.NewServerMetrics(),
formatFlag,
60*time.Minute,
false,
)

r.clock = clocktesting.NewFakeClock(now)
Expand Down Expand Up @@ -616,6 +619,7 @@ func TestValidateAndCompleteWhenScheduleNameSpecified(t *testing.T) {
metrics.NewServerMetrics(),
formatFlag,
60*time.Minute,
false,
)

restore := &velerov1api.Restore{
Expand Down Expand Up @@ -708,6 +712,7 @@ func TestValidateAndCompleteWithResourceModifierSpecified(t *testing.T) {
metrics.NewServerMetrics(),
formatFlag,
60*time.Minute,
false,
)

restore := &velerov1api.Restore{
Expand Down
11 changes: 11 additions & 0 deletions pkg/install/deployment.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ type podTemplateConfig struct {
uploaderType string
defaultSnapshotMoveData bool
privilegedNodeAgent bool
disableInformerCache bool
}

func WithImage(image string) podTemplateOption {
Expand Down Expand Up @@ -144,6 +145,12 @@ func WithDefaultSnapshotMoveData() podTemplateOption {
}
}

// WithDisableInformerCache returns a podTemplateOption that turns on the
// disableInformerCache setting of the pod template config it is applied to.
func WithDisableInformerCache() podTemplateOption {
	disableCache := func(cfg *podTemplateConfig) {
		cfg.disableInformerCache = true
	}
	return disableCache
}

func WithServiceAccountName(sa string) podTemplateOption {
return func(c *podTemplateConfig) {
c.serviceAccountName = sa
Expand Down Expand Up @@ -185,6 +192,10 @@ func Deployment(namespace string, opts ...podTemplateOption) *appsv1.Deployment
args = append(args, "--default-snapshot-move-data=true")
}

if c.disableInformerCache {
args = append(args, "--disable-informer-cache=true")
}

if len(c.uploaderType) > 0 {
args = append(args, fmt.Sprintf("--uploader-type=%s", c.uploaderType))
}
Expand Down
4 changes: 4 additions & 0 deletions pkg/install/deployment_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -64,4 +64,8 @@ func TestDeployment(t *testing.T) {

deploy = Deployment("velero", WithServiceAccountName("test-sa"))
assert.Equal(t, "test-sa", deploy.Spec.Template.Spec.ServiceAccountName)

deploy = Deployment("velero", WithDisableInformerCache())
assert.Len(t, deploy.Spec.Template.Spec.Containers[0].Args, 2)
assert.Equal(t, "--disable-informer-cache=true", deploy.Spec.Template.Spec.Containers[0].Args[1])
}
5 changes: 5 additions & 0 deletions pkg/install/resources.go
Original file line number Diff line number Diff line change
Expand Up @@ -253,6 +253,7 @@ type VeleroOptions struct {
DefaultVolumesToFsBackup bool
UploaderType string
DefaultSnapshotMoveData bool
DisableInformerCache bool
}

func AllCRDs() *unstructured.UnstructuredList {
Expand Down Expand Up @@ -357,6 +358,10 @@ func AllResources(o *VeleroOptions) *unstructured.UnstructuredList {
deployOpts = append(deployOpts, WithDefaultSnapshotMoveData())
}

if o.DisableInformerCache {
deployOpts = append(deployOpts, WithDisableInformerCache())
}

deploy := Deployment(o.Namespace, deployOpts...)

if err := appendUnstructured(resources, deploy); err != nil {
Expand Down
17 changes: 9 additions & 8 deletions pkg/restore/request.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,14 +51,15 @@ func resourceKey(obj runtime.Object) string {
type Request struct {
*velerov1api.Restore

Log logrus.FieldLogger
Backup *velerov1api.Backup
PodVolumeBackups []*velerov1api.PodVolumeBackup
VolumeSnapshots []*volume.Snapshot
BackupReader io.Reader
RestoredItems map[itemKey]restoredItemStatus
itemOperationsList *[]*itemoperation.RestoreOperation
ResourceModifiers *resourcemodifiers.ResourceModifiers
Log logrus.FieldLogger
Backup *velerov1api.Backup
PodVolumeBackups []*velerov1api.PodVolumeBackup
VolumeSnapshots []*volume.Snapshot
BackupReader io.Reader
RestoredItems map[itemKey]restoredItemStatus
itemOperationsList *[]*itemoperation.RestoreOperation
ResourceModifiers *resourcemodifiers.ResourceModifiers
// DisableInformerCache carries the disable-informer-cache setting into the
// restore; the restore controller copies it from its own configuration
// when it builds the Request.
DisableInformerCache bool
}

type restoredItemStatus struct {
Expand Down
Loading

0 comments on commit 7750e12

Please sign in to comment.