Fix restore from backup execution path to use context from caller (#12828)

* fix error reporting

Signed-off-by: Rameez Sajwani <[email protected]>

* adding test

Signed-off-by: Rameez Sajwani <[email protected]>

* fixing bug #12830

Signed-off-by: Rameez Sajwani <[email protected]>

* fixing some error messages

Signed-off-by: Rameez Sajwani <[email protected]>

* remove unwanted comments

Signed-off-by: Rameez Sajwani <[email protected]>

* remove unwanted require in test

Signed-off-by: Rameez Sajwani <[email protected]>

* fix comments

Signed-off-by: Rameez Sajwani <[email protected]>

* adding release-notes

Signed-off-by: Rameez Sajwani <[email protected]>

* feedback for summary.md

Signed-off-by: Rameez Sajwani <[email protected]>

* run tablet type with background context

Signed-off-by: Rameez Sajwani <[email protected]>

* Add check for tablet type

Signed-off-by: Rameez Sajwani <[email protected]>

* fixing typo

Signed-off-by: Rameez Sajwani <[email protected]>

* fix test bug

Signed-off-by: Rameez Sajwani <[email protected]>

---------

Signed-off-by: Rameez Sajwani <[email protected]>
rsajwani authored Apr 20, 2023
1 parent d1c332e commit a836318
Showing 7 changed files with 216 additions and 38 deletions.
8 changes: 8 additions & 0 deletions changelog/17.0/17.0.0/summary.md
@@ -8,6 +8,7 @@
- [VTAdmin web migrated from create-react-app to vite](#migrated-vtadmin)
- [Keyspace name validation in TopoServer](#keyspace-name-validation)
- [Shard name validation in TopoServer](#shard-name-validation)
- [VtctldClient command RestoreFromBackup will now use the correct context](#VtctldClient-RestoreFromBackup)
- **[New command line flags and behavior](#new-flag)**
- [Builtin backup: read buffering flags](#builtin-backup-read-buffering-flags)
- **[New stats](#new-stats)**
@@ -76,6 +77,13 @@ Prior to v17, it was possible to create a shard name with invalid characters, wh

Shard names may no longer contain the forward slash ("/") character, and TopoServer's `CreateShard` method returns an error if given such a name.

#### <a id="VtctldClient-RestoreFromBackup"> VtctldClient command RestoreFromBackup will now use the correct context

The VtctldClient command RestoreFromBackup initiates an asynchronous process on the specified tablet to restore data from either the latest backup or the closest one before the specified backup-timestamp.
Prior to v17, this asynchronous process could run indefinitely in the background because it was invoked with the background context. In v17 ([PR #12830](https://github.com/vitessio/vitess/issues/12830)),
this behavior was changed to use a context with a timeout of `action_timeout`. If you are using VtctldClient to initiate a restore, make sure the `action_timeout` value you provide allows enough
time for the restore process to complete; otherwise, the restore fails with an error when the context expires.
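
For illustration, a minimal sketch of the new behavior, using a hypothetical `restoreFromBackup` helper in place of the real vtctld code path:

```go
package main

import (
	"context"
	"errors"
	"fmt"
	"time"
)

// restoreFromBackup is a hypothetical stand-in for the asynchronous restore
// call; it honors cancellation the way the real execution path now does.
func restoreFromBackup(ctx context.Context, tabletAlias string) error {
	select {
	case <-time.After(2 * time.Second): // pretend the restore takes 2 seconds
		return nil
	case <-ctx.Done():
		return ctx.Err()
	}
}

func main() {
	// As of v17, the restore runs under a context derived from the caller and
	// bounded by action_timeout, rather than under context.Background().
	actionTimeout := 1 * time.Second
	ctx, cancel := context.WithTimeout(context.Background(), actionTimeout)
	defer cancel()

	if err := restoreFromBackup(ctx, "zone1-0000000101"); err != nil {
		// With action_timeout shorter than the restore, this reports a
		// deadline-exceeded error instead of running forever.
		fmt.Println("restore failed:", errors.Is(err, context.DeadlineExceeded), err)
	}
}
```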

### <a id="new-flag"/> New command line flags and behavior

#### <a id="builtin-backup-read-buffering-flags" /> Backup --builtinbackup-file-read-buffer-size and --builtinbackup-file-write-buffer-size
68 changes: 60 additions & 8 deletions go/test/endtoend/backup/vtctlbackup/backup_utils.go
@@ -751,9 +751,15 @@ func terminatedRestore(t *testing.T) {
// previous test to complete (suspicion: MySQL does not fully start)
time.Sleep(5 * time.Second)

checkTabletType(t, replica1.Alias, topodata.TabletType_REPLICA)
terminateBackup(t, replica1.Alias)
// If backup fails then the tablet type goes back to original type.
checkTabletType(t, replica1.Alias, topodata.TabletType_REPLICA)

// backup the replica
err := localCluster.VtctlclientProcess.ExecuteCommand("Backup", replica1.Alias)
require.Nil(t, err)
checkTabletType(t, replica1.Alias, topodata.TabletType_REPLICA)

verifyTabletBackupStats(t, replica1.VttabletProcess.GetVars())

@@ -771,18 +777,14 @@ func terminatedRestore(t *testing.T) {
_, err = replica1.VttabletProcess.QueryTablet("insert into vt_insert_test (msg) values ('test3')", keyspaceName, true)
require.Nil(t, err)

checkTabletType(t, primary.Alias, topodata.TabletType_REPLICA)
terminateRestore(t)
// If restore fails then the tablet type goes back to original type.
checkTabletType(t, primary.Alias, topodata.TabletType_REPLICA)

err = localCluster.VtctlclientProcess.ExecuteCommand("RestoreFromBackup", primary.Alias)
require.Nil(t, err)

output, err := localCluster.VtctlclientProcess.ExecuteCommandWithOutput("GetTablet", primary.Alias)
require.Nil(t, err)

var tabletPB topodata.Tablet
err = json.Unmarshal([]byte(output), &tabletPB)
require.Nil(t, err)
assert.Equal(t, tabletPB.Type, topodata.TabletType_REPLICA)
checkTabletType(t, primary.Alias, topodata.TabletType_REPLICA)

_, err = os.Stat(path.Join(primary.VttabletProcess.Directory, "restore_in_progress"))
assert.True(t, os.IsNotExist(err))
@@ -794,6 +796,22 @@ func terminatedRestore(t *testing.T) {
stopAllTablets()
}

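// checkTabletType polls GetTablet until the tablet reports the expected
// type, failing the test if it does not converge within 15 seconds.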
func checkTabletType(t *testing.T, alias string, tabletType topodata.TabletType) {
// Poll once per second, for up to 15 seconds, until the tablet type is correct.
for i := 0; i < 15; i++ {
output, err := localCluster.VtctlclientProcess.ExecuteCommandWithOutput("GetTablet", alias)
require.Nil(t, err)
var tabletPB topodata.Tablet
err = json.Unmarshal([]byte(output), &tabletPB)
require.Nil(t, err)
if tabletType == tabletPB.Type {
return
}
time.Sleep(1 * time.Second)
}
require.Failf(t, "checkTabletType failed.", "Tablet type is not correct. Expected: %v", tabletType)
}

// test_backup will:
// - create a shard with primary and replica1 only
// - run InitShardPrimary
@@ -927,6 +945,40 @@ func verifySemiSyncStatus(t *testing.T, vttablet *cluster.Vttablet, expectedStat
assert.Equal(t, status, expectedStatus)
}

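// terminateBackup starts a Backup of the given tablet via vtctlclient and
// sends SIGTERM to the client once the expected progress message appears on
// stderr, simulating a backup that is interrupted partway through.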
func terminateBackup(t *testing.T, alias string) {
stopBackupMsg := "Done taking Backup"
if useXtrabackup {
stopBackupMsg = "Starting backup with"
useXtrabackup = false
defer func() {
useXtrabackup = true
}()
}

args := append([]string{"--server", localCluster.VtctlclientProcess.Server, "--alsologtostderr"}, "Backup", "--", alias)
tmpProcess := exec.Command(
"vtctlclient",
args...,
)

reader, _ := tmpProcess.StderrPipe()
err := tmpProcess.Start()
require.Nil(t, err)
found := false

scanner := bufio.NewScanner(reader)

for scanner.Scan() {
text := scanner.Text()
if strings.Contains(text, stopBackupMsg) {
tmpProcess.Process.Signal(syscall.SIGTERM)
found = true //nolint
return
}
}
assert.True(t, found, "backup message not found")
}

func terminateRestore(t *testing.T) {
stopRestoreMsg := "Copying file 10"
if useXtrabackup {
14 changes: 8 additions & 6 deletions go/vt/mysqlctl/builtinbackupengine.go
@@ -527,13 +527,14 @@ func (be *BuiltinBackupEngine) backupFiles(
// unpredictability in my test cases, so in order to avoid that, I am adding this cancellation check.
select {
case <-ctx.Done():
log.Errorf("Context cancelled during %q backup", fe.Name)
log.Errorf("Context canceled or timed out during %q backup", fe.Name)
bh.RecordError(vterrors.Errorf(vtrpc.Code_CANCELED, "context canceled"))
return
default:
}

if bh.HasErrors() {
params.Logger.Infof("failed to backup files due to error.")
return
}

@@ -803,7 +804,7 @@ func (be *BuiltinBackupEngine) executeRestoreFullBackup(ctx context.Context, par

params.Logger.Infof("Restore: copying %v files", len(bm.FileEntries))

if _, err := be.restoreFiles(context.Background(), params, bh, bm); err != nil {
if _, err := be.restoreFiles(ctx, params, bh, bm); err != nil {
// don't delete the file here because that is how we detect an interrupted restore
return vterrors.Wrap(err, "failed to restore files")
}
@@ -816,7 +817,7 @@ func (be *BuiltinBackupEngine) executeRestoreFullBackup(ctx context.Context, par
// The underlying mysql database is expected to be up and running.
func (be *BuiltinBackupEngine) executeRestoreIncrementalBackup(ctx context.Context, params RestoreParams, bh backupstorage.BackupHandle, bm builtinBackupManifest) error {
params.Logger.Infof("Restoring incremental backup to position: %v", bm.Position)
createdDir, err := be.restoreFiles(context.Background(), params, bh, bm)
createdDir, err := be.restoreFiles(ctx, params, bh, bm)
defer os.RemoveAll(createdDir)
mysqld, ok := params.Mysqld.(*Mysqld)
if !ok {
@@ -902,7 +903,7 @@ func (be *BuiltinBackupEngine) restoreFiles(ctx context.Context, params RestoreP
// Wait until we are ready to go, return if we encounter an error
acqErr := sema.Acquire(ctx, 1)
if acqErr != nil {
log.Errorf("Unable to acquire semaphore needed to backup file: %s, err: %s", fe.Name, acqErr.Error())
log.Errorf("Unable to acquire semaphore needed to restore file: %s, err: %s", fe.Name, acqErr.Error())
rec.RecordError(acqErr)
return
}
@@ -913,13 +914,14 @@ func (be *BuiltinBackupEngine) restoreFiles(ctx context.Context, params RestoreP
// unpredictability in my test cases, so in order to avoid that, I am adding this cancellation check.
select {
case <-ctx.Done():
log.Errorf("Context cancelled during %q backup", fe.Name)
bh.RecordError(vterrors.Errorf(vtrpc.Code_CANCELED, "context canceled"))
log.Errorf("Context canceled or timed out during %q restore", fe.Name)
rec.RecordError(vterrors.Errorf(vtrpc.Code_CANCELED, "context canceled"))
return
default:
}

if rec.HasErrors() {
params.Logger.Infof("Failed to restore files due to error.")
return
}

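Both backupFiles and restoreFiles follow the same concurrency pattern: a weighted semaphore bounds how many files are processed at once, and each worker re-checks the context after acquiring it. A simplified, self-contained sketch of that pattern (the real engine additionally records errors through a shared recorder and copies actual files):

```go
package main

import (
	"context"
	"fmt"
	"sync"
	"time"

	"golang.org/x/sync/semaphore"
)

func main() {
	// A short timeout stands in for an expiring action_timeout.
	ctx, cancel := context.WithTimeout(context.Background(), 50*time.Millisecond)
	defer cancel()

	sema := semaphore.NewWeighted(2) // process at most 2 files at a time
	files := []string{"test1/0.ibd", "test1/1.ibd", "test2/0.ibd", "test2/1.ibd"}

	var wg sync.WaitGroup
	for _, name := range files {
		wg.Add(1)
		go func(name string) {
			defer wg.Done()
			// Wait until we are ready to go; Acquire fails once ctx expires.
			if err := sema.Acquire(ctx, 1); err != nil {
				fmt.Printf("unable to acquire semaphore for %q: %v\n", name, err)
				return
			}
			defer sema.Release(1)
			// Re-check cancellation after acquisition, as the engine does.
			select {
			case <-ctx.Done():
				fmt.Printf("context canceled or timed out during %q restore\n", name)
				return
			default:
			}
			time.Sleep(100 * time.Millisecond) // pretend to copy the file
			fmt.Printf("restored %q\n", name)
		}(name)
	}
	wg.Wait()
}
```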
152 changes: 133 additions & 19 deletions go/vt/mysqlctl/builtinbackupengine_test.go
@@ -22,6 +22,7 @@ import (
"fmt"
"os"
"path"
"strings"
"testing"
"time"

@@ -38,8 +39,6 @@ import (
"vitess.io/vitess/go/vt/proto/vttime"
"vitess.io/vitess/go/vt/topo"
"vitess.io/vitess/go/vt/topo/memorytopo"
"vitess.io/vitess/go/vt/vttablet/faketmclient"
"vitess.io/vitess/go/vt/vttablet/tmclient"
)

func setBuiltinBackupMysqldDeadline(t time.Duration) time.Duration {
@@ -113,12 +112,6 @@ func TestExecuteBackup(t *testing.T) {
})
require.NoError(t, err)

// Set up tm client
// Note that using faketmclient.NewFakeTabletManagerClient will cause infinite recursion :shrug:
tmclient.RegisterTabletManagerClientFactory("grpc",
func() tmclient.TabletManagerClient { return &faketmclient.FakeTabletManagerClient{} },
)

be := &mysqlctl.BuiltinBackupEngine{}

// Configure a tight deadline to force a timeout
@@ -171,10 +164,9 @@
assert.False(t, ok)
}

// TestExecuteBackupWithCancelledContext test the ability of backup to gracefully
// handle cases where we encounter error due to any reasons for e.g context cancel etc.
// Process should not panic in these situations.
func TestExecuteBackupWithCancelledContext(t *testing.T) {
// TestExecuteBackupWithCanceledContext tests the ability of the backup function to gracefully handle cases where errors
// occur due to various reasons, such as context cancellation. The process should not panic in these situations.
func TestExecuteBackupWithCanceledContext(t *testing.T) {
// Set up local backup directory
id := fmt.Sprintf("%d", time.Now().UnixNano())
backupRoot := fmt.Sprintf("testdata/builtinbackup_test_%s", id)
@@ -189,7 +181,7 @@ func TestExecuteBackupWithCancelledContext(t *testing.T) {
require.NoError(t, createBackupFiles(path.Join(dataDir, "test2"), 2, "ibd"))
defer os.RemoveAll(backupRoot)

// cancel the context deliberately
// Cancel the context deliberately
ctx, cancel := context.WithCancel(context.Background())
cancel()
needIt, err := needInnoDBRedoLogSubdir()
@@ -225,12 +217,6 @@ func TestExecuteBackupWithCancelledContext(t *testing.T) {
})
require.NoError(t, err)

// Set up tm client.
// Note that using faketmclient.NewFakeTabletManagerClient will cause infinite recursion :shrug:
tmclient.RegisterTabletManagerClientFactory("grpc2",
func() tmclient.TabletManagerClient { return &faketmclient.FakeTabletManagerClient{} },
)

be := &mysqlctl.BuiltinBackupEngine{}
bh := filebackupstorage.NewBackupHandle(nil, "", "", false)
// Spin up a fake daemon to be used in backups. It needs to be allowed to receive:
@@ -260,6 +246,134 @@ func TestExecuteBackupWithCancelledContext(t *testing.T) {
assert.False(t, ok)
}

// TestExecuteRestoreWithTimedOutContext tests the ability of the restore function to gracefully handle cases where errors
// occur due to various reasons, such as a timed-out context. The process should not panic in these situations.
func TestExecuteRestoreWithTimedOutContext(t *testing.T) {
// Set up local backup directory
id := fmt.Sprintf("%d", time.Now().UnixNano())
backupRoot := fmt.Sprintf("testdata/builtinbackup_test_%s", id)
filebackupstorage.FileBackupStorageRoot = backupRoot
require.NoError(t, createBackupDir(backupRoot, "innodb", "log", "datadir"))
dataDir := path.Join(backupRoot, "datadir")
// Add some files under data directory to force backup to execute semaphore acquire inside
// backupFiles() method (https://github.com/vitessio/vitess/blob/main/go/vt/mysqlctl/builtinbackupengine.go#L483).
require.NoError(t, createBackupDir(dataDir, "test1"))
require.NoError(t, createBackupDir(dataDir, "test2"))
require.NoError(t, createBackupFiles(path.Join(dataDir, "test1"), 2, "ibd"))
require.NoError(t, createBackupFiles(path.Join(dataDir, "test2"), 2, "ibd"))
defer os.RemoveAll(backupRoot)

ctx := context.Background()
needIt, err := needInnoDBRedoLogSubdir()
require.NoError(t, err)
if needIt {
fpath := path.Join("log", mysql.DynamicRedoLogSubdir)
if err := createBackupDir(backupRoot, fpath); err != nil {
require.Failf(t, err.Error(), "failed to create directory: %s", fpath)
}
}

// Set up topo
keyspace, shard := "mykeyspace", "-80"
ts := memorytopo.NewServer("cell1")
defer ts.Close()

require.NoError(t, ts.CreateKeyspace(ctx, keyspace, &topodata.Keyspace{}))
require.NoError(t, ts.CreateShard(ctx, keyspace, shard))

tablet := topo.NewTablet(100, "cell1", "mykeyspace-00-80-0100")
tablet.Keyspace = keyspace
tablet.Shard = shard

require.NoError(t, ts.CreateTablet(ctx, tablet))

_, err = ts.UpdateShardFields(ctx, keyspace, shard, func(si *topo.ShardInfo) error {
si.PrimaryAlias = &topodata.TabletAlias{Uid: 100, Cell: "cell1"}

now := time.Now()
si.PrimaryTermStartTime = &vttime.Time{Seconds: int64(now.Second()), Nanoseconds: int32(now.Nanosecond())}

return nil
})
require.NoError(t, err)

be := &mysqlctl.BuiltinBackupEngine{}
bh := filebackupstorage.NewBackupHandle(nil, "", "", false)
// Spin up a fake daemon to be used in backups. It needs to be allowed to receive:
// "STOP SLAVE", "START SLAVE", in that order.
mysqld := mysqlctl.NewFakeMysqlDaemon(fakesqldb.New(t))
mysqld.ExpectedExecuteSuperQueryList = []string{"STOP SLAVE", "START SLAVE"}

ok, err := be.ExecuteBackup(ctx, mysqlctl.BackupParams{
Logger: logutil.NewConsoleLogger(),
Mysqld: mysqld,
Cnf: &mysqlctl.Mycnf{
InnodbDataHomeDir: path.Join(backupRoot, "innodb"),
InnodbLogGroupHomeDir: path.Join(backupRoot, "log"),
DataDir: path.Join(backupRoot, "datadir"),
},
Stats: backupstats.NewFakeStats(),
Concurrency: 2,
HookExtraEnv: map[string]string{},
TopoServer: ts,
Keyspace: keyspace,
Shard: shard,
}, bh)

require.NoError(t, err)
assert.True(t, ok)

// Now try to restore the above backup.
bh = filebackupstorage.NewBackupHandle(nil, "", "", true)
mysqld = mysqlctl.NewFakeMysqlDaemon(fakesqldb.New(t))
mysqld.ExpectedExecuteSuperQueryList = []string{"STOP SLAVE", "START SLAVE"}
restoreParams := mysqlctl.RestoreParams{
Cnf: &mysqlctl.Mycnf{
InnodbDataHomeDir: path.Join(backupRoot, "innodb"),
InnodbLogGroupHomeDir: path.Join(backupRoot, "log"),
DataDir: path.Join(backupRoot, "datadir"),
BinLogPath: path.Join(backupRoot, "binlog"),
RelayLogPath: path.Join(backupRoot, "relaylog"),
RelayLogIndexPath: path.Join(backupRoot, "relaylogindex"),
RelayLogInfoPath: path.Join(backupRoot, "relayloginfo"),
},
Logger: logutil.NewConsoleLogger(),
Mysqld: mysqld,
Concurrency: 2,
HookExtraEnv: map[string]string{},
DeleteBeforeRestore: false,
DbName: "test",
Keyspace: "test",
Shard: "-",
StartTime: time.Now(),
RestoreToPos: mysql.Position{},
DryRun: false,
Stats: backupstats.NewFakeStats(),
}

// Successful restore.
bm, err := be.ExecuteRestore(ctx, restoreParams, bh)
assert.NoError(t, err)
assert.NotNil(t, bm)

// Restore using timed-out context
mysqld = mysqlctl.NewFakeMysqlDaemon(fakesqldb.New(t))
mysqld.ExpectedExecuteSuperQueryList = []string{"STOP SLAVE", "START SLAVE"}
restoreParams.Mysqld = mysqld
timedOutCtx, cancel := context.WithTimeout(ctx, 1*time.Second)
defer cancel()
// Let the context time out.
time.Sleep(1 * time.Second)
bm, err = be.ExecuteRestore(timedOutCtx, restoreParams, bh)
// ExecuteRestore should fail.
assert.Error(t, err)
assert.Nil(t, bm)
// The error message may contain either "context canceled" or "context deadline exceeded".
if !strings.Contains(err.Error(), "context canceled") && !strings.Contains(err.Error(), "context deadline exceeded") {
assert.Fail(t, "Test should fail with either `context canceled` or `context deadline exceeded`")
}
}

// needInnoDBRedoLogSubdir indicates whether we need to create a redo log subdirectory.
// Starting with MySQL 8.0.30, the InnoDB redo logs are stored in a subdirectory of the
// <innodb_log_group_home_dir> (<datadir>/. by default) called "#innodb_redo". See:
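As the truncated comment above describes, MySQL 8.0.30 moved the InnoDB redo logs into a "#innodb_redo" subdirectory. A hypothetical version gate illustrating that decision (the real helper inspects the running server's version rather than taking it as parameters):

```go
package main

import "fmt"

// redoLogSubdirNeeded is a hypothetical stand-in for needInnoDBRedoLogSubdir:
// it reports whether this MySQL version keeps its redo logs under
// <innodb_log_group_home_dir>/#innodb_redo (introduced in MySQL 8.0.30).
func redoLogSubdirNeeded(major, minor, patch int) bool {
	if major != 8 {
		return major > 8
	}
	return minor > 0 || patch >= 30
}

func main() {
	fmt.Println(redoLogSubdirNeeded(8, 0, 29)) // false
	fmt.Println(redoLogSubdirNeeded(8, 0, 30)) // true
}
```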
2 changes: 1 addition & 1 deletion go/vt/mysqlctl/mysqld.go
@@ -443,7 +443,7 @@ func (mysqld *Mysqld) startNoWait(ctx context.Context, cnf *Mycnf, mysqldArgs ..
err := cmd.Wait()
log.Infof("%v exit: %v", ts, err)

// The process exited. Trigger OnTerm callbacks, unless we were cancelled.
// The process exited. Trigger OnTerm callbacks, unless we were canceled.
select {
case <-cancel:
default:
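The guard in the hunk above — fire OnTerm callbacks only when no cancellation was requested — can be sketched as a standalone pattern (a simplified illustration, not the actual vitess code):

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	cancel := make(chan struct{}) // closed when the caller cancels the wait
	onTerm := func() { fmt.Println("OnTerm callback fired") }

	done := make(chan struct{})
	go func() {
		defer close(done)
		time.Sleep(10 * time.Millisecond) // pretend mysqld ran, then exited
		// The process exited. Trigger OnTerm callbacks, unless we were canceled.
		select {
		case <-cancel:
			// Canceled: skip the callbacks.
		default:
			onTerm()
		}
	}()
	<-done
}
```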