Skip to content

Commit

Permalink
Add Support for Restoring Specific Backups
Browse files Browse the repository at this point in the history
By default, Vitess will only make practical use of the latest
backup of a given shard. While this makes perfect sense for
the common use cases, there are times when you need to restore
a specific backup. For example:
  1. To extract a portion of the data that can then be merged
with the current state. For example, if you later realize that
you accidentally deleted some records from a table last week,
you need to perform a restore so that you can copy those
specific records back to the live data set.
  2. To perform validation, forensics, or analysis on the system
state as it was at the time of that backup.
  3. To perform a specific point-in-time recovery (PITR), for whatever reason.

This is a continuation of: #7998

This solves: #4905

Co-authored-by: Guido Iaquinti <[email protected]>
Signed-off-by: Matt Lord <[email protected]>
  • Loading branch information
mattlord and Guido Iaquinti committed Sep 15, 2021
1 parent ec0117d commit 50938fd
Show file tree
Hide file tree
Showing 5 changed files with 29 additions and 16 deletions.
4 changes: 2 additions & 2 deletions go/vt/mysqlctl/backupengine.go
Original file line number Diff line number Diff line change
Expand Up @@ -217,8 +217,8 @@ func FindBackupToRestore(ctx context.Context, params RestoreParams, bhs []backup
continue
}
}
if !checkBackupTime /* not snapshot */ || backupTime.Equal(params.StartTime) || backupTime.Before(params.StartTime) {
params.Logger.Infof("Restore: found backup %v %v to restore", bh.Directory(), bh.Name())
if !checkBackupTime || backupTime.Equal(params.StartTime) || backupTime.Before(params.StartTime) {
params.Logger.Infof("Restore: found backup %v %v to restore using a timestamp of %v", bh.Directory(), bh.Name(), params.StartTime.Format(BackupTimestampFormat))
break
}
}
Expand Down
23 changes: 18 additions & 5 deletions go/vt/vttablet/tabletmanager/restore.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ import (

var (
restoreFromBackup = flag.Bool("restore_from_backup", false, "(init restore parameter) will check BackupStorage for a recent backup at startup and start there")
restoreFromBackupTs = flag.String("restore_from_backup_ts", "", "(init restore parameter) if set, restore the last backup taken at or before this timestamp. Example: '2021-04-29.133050'")
restoreConcurrency = flag.Int("restore_concurrency", 4, "(init restore parameter) how many concurrent files to restore at once")
waitForBackupInterval = flag.Duration("wait_for_backup_interval", 0, "(init restore parameter) if this is greater than 0, instead of starting up empty when no backups are found, keep checking at this interval for a backup to appear")

Expand All @@ -65,7 +66,7 @@ var (
// It will either work, fail gracefully, or return
// an error in case of a non-recoverable error.
// It takes the action lock so no RPC interferes.
func (tm *TabletManager) RestoreData(ctx context.Context, logger logutil.Logger, waitForBackupInterval time.Duration, deleteBeforeRestore bool) error {
func (tm *TabletManager) RestoreData(ctx context.Context, logger logutil.Logger, waitForBackupInterval time.Duration, deleteBeforeRestore bool, restoreFromBackupTs string) error {
if err := tm.lock(ctx); err != nil {
return err
}
Expand Down Expand Up @@ -119,7 +120,7 @@ func (tm *TabletManager) RestoreData(ctx context.Context, logger logutil.Logger,

startTime = time.Now()

err = tm.restoreDataLocked(ctx, logger, waitForBackupInterval, deleteBeforeRestore)
err = tm.restoreDataLocked(ctx, logger, waitForBackupInterval, deleteBeforeRestore, restoreFromBackupTs)
if err != nil {
return err
}
Expand All @@ -137,7 +138,7 @@ func (tm *TabletManager) RestoreData(ctx context.Context, logger logutil.Logger,
return nil
}

func (tm *TabletManager) restoreDataLocked(ctx context.Context, logger logutil.Logger, waitForBackupInterval time.Duration, deleteBeforeRestore bool) error {
func (tm *TabletManager) restoreDataLocked(ctx context.Context, logger logutil.Logger, waitForBackupInterval time.Duration, deleteBeforeRestore bool, restoreFromBackupTs string) error {

tablet := tm.Tablet()
originalType := tablet.Type
Expand All @@ -152,14 +153,26 @@ func (tm *TabletManager) restoreDataLocked(ctx context.Context, logger logutil.L
if err != nil {
return err
}

// Check whether we should use the latest backup (the default) or a specified backup timestamp for the restore
var startTime time.Time

if restoreFromBackupTs != "" {
startTime, err = time.Parse(mysqlctl.BackupTimestampFormat, restoreFromBackupTs)
if err != nil {
return vterrors.New(vtrpcpb.Code_INVALID_ARGUMENT, fmt.Sprintf("unable to parse the timestamp specified for -restore_from_backup_ts of %s: %v", restoreFromBackupTs, err))
}
}

// For a SNAPSHOT keyspace, we have to look for backups of BaseKeyspace
// so we will pass the BaseKeyspace in RestoreParams instead of tablet.Keyspace
if keyspaceInfo.KeyspaceType == topodatapb.KeyspaceType_SNAPSHOT {
if keyspaceInfo.BaseKeyspace == "" {
return vterrors.New(vtrpcpb.Code_INVALID_ARGUMENT, fmt.Sprintf("snapshot keyspace %v has no base_keyspace set", tablet.Keyspace))
}
keyspace = keyspaceInfo.BaseKeyspace
log.Infof("Using base_keyspace %v to restore keyspace %v", keyspace, tablet.Keyspace)
startTime = logutil.ProtoToTime(keyspaceInfo.SnapshotTime)
log.Infof("Using base_keyspace %v to restore keyspace %v using a backup timestamp of %v", keyspace, tablet.Keyspace, startTime)
}

params := mysqlctl.RestoreParams{
Expand All @@ -173,7 +186,7 @@ func (tm *TabletManager) restoreDataLocked(ctx context.Context, logger logutil.L
DbName: topoproto.TabletDbName(tablet),
Keyspace: keyspace,
Shard: tablet.Shard,
StartTime: logutil.ProtoToTime(keyspaceInfo.SnapshotTime),
StartTime: startTime,
}

// Check whether we're going to restore before changing to RESTORE type,
Expand Down
4 changes: 2 additions & 2 deletions go/vt/vttablet/tabletmanager/rpc_backup.go
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ func (tm *TabletManager) Backup(ctx context.Context, concurrency int, logger log
return returnErr
}

// RestoreFromBackup deletes all local data and restores anew from the latest backup.
// RestoreFromBackup deletes all local data and then restores the data from the latest backup.
func (tm *TabletManager) RestoreFromBackup(ctx context.Context, logger logutil.Logger) error {
if err := tm.lock(ctx); err != nil {
return err
Expand All @@ -169,7 +169,7 @@ func (tm *TabletManager) RestoreFromBackup(ctx context.Context, logger logutil.L
l := logutil.NewTeeLogger(logutil.NewConsoleLogger(), logger)

// now we can run restore
err = tm.restoreDataLocked(ctx, l, 0 /* waitForBackupInterval */, true /* deleteBeforeRestore */)
err = tm.restoreDataLocked(ctx, l, 0 /* waitForBackupInterval */, true /* deleteBeforeRestore */, "" /*restoreFromBackupTs */)

// re-run health check to be sure to capture any replication delay
tm.QueryServiceControl.BroadcastHealth()
Expand Down
2 changes: 1 addition & 1 deletion go/vt/vttablet/tabletmanager/tm_init.go
Original file line number Diff line number Diff line change
Expand Up @@ -638,7 +638,7 @@ func (tm *TabletManager) handleRestore(ctx context.Context) (bool, error) {

// restoreFromBackup will just be a regular action
// (same as if it was triggered remotely)
if err := tm.RestoreData(ctx, logutil.NewConsoleLogger(), *waitForBackupInterval, false /* deleteBeforeRestore */); err != nil {
if err := tm.RestoreData(ctx, logutil.NewConsoleLogger(), *waitForBackupInterval, false /* deleteBeforeRestore */, *restoreFromBackupTs); err != nil {
log.Exitf("RestoreFromBackup failed: %v", err)
}
}()
Expand Down
12 changes: 6 additions & 6 deletions go/vt/wrangler/testlib/backup_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,7 @@ func TestBackupRestore(t *testing.T) {
RelayLogInfoPath: path.Join(root, "relay-log.info"),
}

require.NoError(t, destTablet.TM.RestoreData(ctx, logutil.NewConsoleLogger(), 0 /* waitForBackupInterval */, false /* deleteBeforeRestore */))
require.NoError(t, destTablet.TM.RestoreData(ctx, logutil.NewConsoleLogger(), 0 /* waitForBackupInterval */, false /* deleteBeforeRestore */, "" /* restoreFromBackupTs */))
// verify the full status
require.NoError(t, destTablet.FakeMysqlDaemon.CheckSuperQueryList(), "destTablet.FakeMysqlDaemon.CheckSuperQueryList failed")
assert.True(t, destTablet.FakeMysqlDaemon.Replicating)
Expand Down Expand Up @@ -224,7 +224,7 @@ func TestBackupRestore(t *testing.T) {
primary.FakeMysqlDaemon.SetReplicationPositionPos = primary.FakeMysqlDaemon.CurrentPrimaryPosition

// restore primary from backup
require.NoError(t, primary.TM.RestoreData(ctx, logutil.NewConsoleLogger(), 0 /* waitForBackupInterval */, false /* deleteBeforeRestore */), "RestoreData failed")
require.NoError(t, primary.TM.RestoreData(ctx, logutil.NewConsoleLogger(), 0 /* waitForBackupInterval */, false /* deleteBeforeRestore */, "" /* restoreFromBackupTs */), "RestoreData failed")
// tablet was created as PRIMARY, so its baseTabletType is PRIMARY
assert.Equal(t, topodatapb.TabletType_PRIMARY, primary.Tablet.Type)
assert.False(t, primary.FakeMysqlDaemon.Replicating)
Expand All @@ -238,7 +238,7 @@ func TestBackupRestore(t *testing.T) {
"SHOW TABLES FROM `vt_test_keyspace`": {Rows: [][]sqltypes.Value{{sqltypes.NewVarBinary("a")}}},
}

require.NoError(t, primary.TM.RestoreData(ctx, logutil.NewConsoleLogger(), 0 /* waitForBackupInterval */, false /* deleteBeforeRestore */), "RestoreData failed")
require.NoError(t, primary.TM.RestoreData(ctx, logutil.NewConsoleLogger(), 0 /* waitForBackupInterval */, false /* deleteBeforeRestore */, "" /* restoreFromBackupTs */), "RestoreData failed")
// Tablet type should not change
assert.Equal(t, topodatapb.TabletType_PRIMARY, primary.Tablet.Type)
assert.False(t, primary.FakeMysqlDaemon.Replicating)
Expand Down Expand Up @@ -416,7 +416,7 @@ func TestBackupRestoreLagged(t *testing.T) {

errCh = make(chan error, 1)
go func(ctx context.Context, tablet *FakeTablet) {
errCh <- tablet.TM.RestoreData(ctx, logutil.NewConsoleLogger(), 0 /* waitForBackupInterval */, false /* deleteBeforeRestore */)
errCh <- tablet.TM.RestoreData(ctx, logutil.NewConsoleLogger(), 0 /* waitForBackupInterval */, false /* deleteBeforeRestore */, "" /* restoreFromBackupTs */)
}(ctx, destTablet)

timer = time.NewTicker(1 * time.Second)
Expand Down Expand Up @@ -588,7 +588,7 @@ func TestRestoreUnreachablePrimary(t *testing.T) {
// set a short timeout so that we don't have to wait 30 seconds
*topo.RemoteOperationTimeout = 2 * time.Second
// Restore should still succeed
require.NoError(t, destTablet.TM.RestoreData(ctx, logutil.NewConsoleLogger(), 0 /* waitForBackupInterval */, false /* deleteBeforeRestore */))
require.NoError(t, destTablet.TM.RestoreData(ctx, logutil.NewConsoleLogger(), 0 /* waitForBackupInterval */, false /* deleteBeforeRestore */, "" /* restoreFromBackupTs */))
// verify the full status
require.NoError(t, destTablet.FakeMysqlDaemon.CheckSuperQueryList(), "destTablet.FakeMysqlDaemon.CheckSuperQueryList failed")
assert.True(t, destTablet.FakeMysqlDaemon.Replicating)
Expand Down Expand Up @@ -739,7 +739,7 @@ func TestDisableActiveReparents(t *testing.T) {
RelayLogInfoPath: path.Join(root, "relay-log.info"),
}

require.NoError(t, destTablet.TM.RestoreData(ctx, logutil.NewConsoleLogger(), 0 /* waitForBackupInterval */, false /* deleteBeforeRestore */))
require.NoError(t, destTablet.TM.RestoreData(ctx, logutil.NewConsoleLogger(), 0 /* waitForBackupInterval */, false /* deleteBeforeRestore */, "" /* restoreFromBackupTs */))
// verify the full status
require.NoError(t, destTablet.FakeMysqlDaemon.CheckSuperQueryList(), "destTablet.FakeMysqlDaemon.CheckSuperQueryList failed")
assert.False(t, destTablet.FakeMysqlDaemon.Replicating)
Expand Down

0 comments on commit 50938fd

Please sign in to comment.