Skip to content

Commit

Permalink
release-22.1: backupccl: introduce BACKUP-LOCK file
Browse files Browse the repository at this point in the history
Only one backup job is allowed to write to a backup location.
Prior to this change, the backup job would rely on the presence of
a BACKUP-CHECKPOINT file to know of the presence of a concurrent backup
job writing to the same location. This was problematic in subtle ways.

In 22.1, we moved backup destination resolution, and the writing of the
checkpoint file to the backup resumer. Before writing the checkpoint file we
would check if anyone else had laid claim to the loxation. Now, all operations
in a job resumer need to be idempotent because a job can be resumed an
arbittrary number of times, either due to transient errors or user intervention.
One can imagine (and we have seen more than once in recent roachtests)
a situation where a job:

1. Checks for other BACKUP-CHECKPOINT files in the location, but finds none.
2. Writes its own BACKUP-CHECKPOINT file.
3. Gets resumed before it gets to update BackupDetails to indicate it has
completed 1) and 2).

So, when the job repeats 1), it will now see its own BACKUP-CHECKPOINT
file and claim another backup is writing to the location, foolishly locking
itself out.

A similar situation can happen in a mixed version state where the node performs
1) and 2) during planning, and the planner txn retries.

Before we discuss the solution it is important to highlight the mixed
version states to consider:

1) Backups planned/executed by 21.2.x and 22.1.0 nodes will continue
to check BACKUP-CHECKPOINT files before laying claim to a location.

2) Backups planned/executed by 21.2.x and 22.1.0 nodes will continue
to write BACKUP-CHECKPOINT files as their way of claiming a location.

This change introduces a `BACKUP-LOCK` file that going forward will be
used to check and lay claim on a location. The `BACKUP-LOCK` file will
be suffixed with the jobID of the backup job. With this change a backup job will
check for the existence of `BACKUP-LOCK` files suffixed with a job ID other
than their own, before laying claim to a location. We continue to read
the BACKUP-CHECKPOINT file so as to respect the claim laid by backups
started on older binary nodes. Naturally, the job also continues to write
a BACKUP-CHECKPOINT file which prevents older nodes from starting concurrent
backups.

Fixes: cockroachdb#81808

Release note: None
  • Loading branch information
adityamaru committed May 31, 2022
1 parent 9549e03 commit c770ee3
Show file tree
Hide file tree
Showing 10 changed files with 637 additions and 144 deletions.
127 changes: 69 additions & 58 deletions pkg/ccl/backupccl/backup_destination.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,6 @@ package backupccl

import (
"context"
"encoding/hex"
"fmt"
"net/url"
"strings"

Expand All @@ -27,12 +25,10 @@ import (
"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgcode"
"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgerror"
"github.com/cockroachdb/cockroach/pkg/util/ctxgroup"
"github.com/cockroachdb/cockroach/pkg/util/encoding"
"github.com/cockroachdb/cockroach/pkg/util/hlc"
"github.com/cockroachdb/cockroach/pkg/util/ioctx"
"github.com/cockroachdb/cockroach/pkg/util/mon"
"github.com/cockroachdb/cockroach/pkg/util/syncutil"
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
"github.com/cockroachdb/errors"
)

Expand Down Expand Up @@ -67,6 +63,27 @@ func fetchPreviousBackups(
return prevBackups, encryptionOptions, size, nil
}

// backupDestination encapsulates information that is populated while resolving
// the destination of a backup.
type backupDestination struct {
// collectionURI is the URI pointing to the backup collection.
collectionURI string

// defaultURI is the full path of the defaultURI of the backup.
defaultURI string

// chosenSubdir is the automatically chosen suffix within the collection path
// if we're backing up INTO a collection.
chosenSubdir string

// urisByLocalityKV is a mapping from the locality tag to the corresponding
// locality aware backup URI.
urisByLocalityKV map[string]string

// prevBackupURIs is the list of full paths for previous backups in the chain.
prevBackupURIs []string
}

// resolveDest resolves the true destination of a backup. The backup command
// provided by the user may point to a backup collection, or a backup location
// which auto-appends incremental backups to it. This method checks for these
Expand All @@ -83,70 +100,63 @@ func resolveDest(
endTime hlc.Timestamp,
incrementalFrom []string,
execCfg *sql.ExecutorConfig,
) (
collectionURI string,
plannedBackupDefaultURI string, /* the full path for the planned backup */
/* chosenSuffix is the automatically chosen suffix within the collection path
if we're backing up INTO a collection. */
chosenSuffix string,
urisByLocalityKV map[string]string,
prevBackupURIs []string, /* list of full paths for previous backups in the chain */
err error,
) {
) (backupDestination, error) {
makeCloudStorage := execCfg.DistSQLSrv.ExternalStorageFromURI

defaultURI, _, err := getURIsByLocalityKV(dest.To, "")
if err != nil {
return "", "", "", nil, nil, err
return backupDestination{}, err
}

chosenSuffix = dest.Subdir

var collectionURI string
chosenSuffix := dest.Subdir
if chosenSuffix != "" {
// The legacy backup syntax, BACKUP TO, leaves the dest.Subdir and collection parameters empty.
collectionURI = defaultURI

if chosenSuffix == latestFileName {
latest, err := readLatestFile(ctx, defaultURI, makeCloudStorage, user)
if err != nil {
return "", "", "", nil, nil, err
return backupDestination{}, err
}
chosenSuffix = latest
}
}

plannedBackupDefaultURI, urisByLocalityKV, err = getURIsByLocalityKV(dest.To, chosenSuffix)
plannedBackupDefaultURI, urisByLocalityKV, err := getURIsByLocalityKV(dest.To, chosenSuffix)
if err != nil {
return "", "", "", nil, nil, err
return backupDestination{}, err
}

// At this point, the plannedBackupDefaultURI is the full path for the backup. For BACKUP
// INTO, this path includes the chosenSuffix. Once this function returns, the
// plannedBackupDefaultURI will be the full path for this backup in planning.
// At this point, the defaultURI is the full path for the backup. For BACKUP
// INTO, this path includes the chosenSubdir. Once this function returns, the
// defaultURI will be the full path for this backup in planning.
if len(incrementalFrom) != 0 {
// Legacy backup with deprecated BACKUP TO-syntax.
prevBackupURIs = incrementalFrom
return collectionURI, plannedBackupDefaultURI, chosenSuffix, urisByLocalityKV, prevBackupURIs, nil
prevBackupURIs := incrementalFrom
return backupDestination{
collectionURI: collectionURI,
defaultURI: plannedBackupDefaultURI,
chosenSubdir: chosenSuffix,
urisByLocalityKV: urisByLocalityKV,
prevBackupURIs: prevBackupURIs,
}, nil
}

defaultStore, err := makeCloudStorage(ctx, plannedBackupDefaultURI, user)
if err != nil {
return "", "", "", nil, nil, err
return backupDestination{}, err
}
defer defaultStore.Close()
exists, err := containsManifest(ctx, defaultStore)
if err != nil {
return "", "", "", nil, nil, err
return backupDestination{}, err
}
if exists && !dest.Exists && chosenSuffix != "" && execCfg.Settings.Version.IsActive(ctx,
clusterversion.Start22_1) {
if exists && !dest.Exists && chosenSuffix != "" &&
execCfg.Settings.Version.IsActive(ctx, clusterversion.Start22_1) {
// We disallow a user from writing a full backup to a path in a collection containing an
// existing backup iff we're 99.9% confident this backup was planned on a 22.1 node.
return "",
"",
"",
nil,
nil,
return backupDestination{},
errors.Newf("A full backup already exists in %s. "+
"Consider running an incremental backup to this full backup via `BACKUP INTO '%s' IN '%s'`",
plannedBackupDefaultURI, chosenSuffix, dest.To[0])
Expand All @@ -168,7 +178,7 @@ func resolveDest(
featureFullBackupUserSubdir,
"'Full Backup with user defined subdirectory'",
); err != nil {
return "", "", "", nil, nil, errors.Wrapf(err,
return backupDestination{}, errors.Wrapf(err,
"The full backup cannot get written to '%s', a user defined subdirectory. "+
"To take a full backup, remove the subdirectory from the backup command, "+
"(i.e. run 'BACKUP ... INTO <collectionURI>'). "+
Expand All @@ -179,7 +189,13 @@ func resolveDest(
}
}
// There's no full backup in the resolved subdirectory; therefore, we're conducting a full backup.
return collectionURI, plannedBackupDefaultURI, chosenSuffix, urisByLocalityKV, prevBackupURIs, nil
return backupDestination{
collectionURI: collectionURI,
defaultURI: plannedBackupDefaultURI,
chosenSubdir: chosenSuffix,
urisByLocalityKV: urisByLocalityKV,
prevBackupURIs: nil,
}, nil
}

// The defaultStore contains a full backup; consequently, we're conducting an incremental backup.
Expand All @@ -191,42 +207,48 @@ func resolveDest(
dest.To,
chosenSuffix)
if err != nil {
return "", "", "", nil, nil, err
return backupDestination{}, err
}

priorsDefaultURI, _, err := getURIsByLocalityKV(fullyResolvedIncrementalsLocation, "")
if err != nil {
return "", "", "", nil, nil, err
return backupDestination{}, err
}
incrementalStore, err := makeCloudStorage(ctx, priorsDefaultURI, user)
if err != nil {
return "", "", "", nil, nil, err
return backupDestination{}, err
}
defer incrementalStore.Close()

priors, err := FindPriorBackups(ctx, incrementalStore, OmitManifest)
if err != nil {
return "", "", "", nil, nil, errors.Wrap(err, "adjusting backup destination to append new layer to existing backup")
return backupDestination{}, errors.Wrap(err, "adjusting backup destination to append new layer to existing backup")
}

prevBackupURIs := make([]string, 0)
for _, prior := range priors {
priorURI, err := url.Parse(priorsDefaultURI)
if err != nil {
return "", "", "", nil, nil, errors.Wrapf(err, "parsing default backup location %s",
priorsDefaultURI)
return backupDestination{}, errors.Wrapf(err, "parsing default backup location %s", priorsDefaultURI)
}
priorURI.Path = JoinURLPath(priorURI.Path, prior)
prevBackupURIs = append(prevBackupURIs, priorURI.String())
}
prevBackupURIs = append([]string{plannedBackupDefaultURI}, prevBackupURIs...)

// Within the chosenSuffix dir, differentiate incremental backups with partName.
// Within the chosenSubdir dir, differentiate incremental backups with partName.
partName := endTime.GoTime().Format(DateBasedIncFolderName)
defaultIncrementalsURI, urisByLocalityKV, err := getURIsByLocalityKV(fullyResolvedIncrementalsLocation, partName)
if err != nil {
return "", "", "", nil, nil, err
return backupDestination{}, err
}
return collectionURI, defaultIncrementalsURI, chosenSuffix, urisByLocalityKV, prevBackupURIs, nil
return backupDestination{
collectionURI: collectionURI,
defaultURI: defaultIncrementalsURI,
chosenSubdir: chosenSuffix,
urisByLocalityKV: urisByLocalityKV,
prevBackupURIs: prevBackupURIs,
}, nil
}

// getBackupManifests fetches the backup manifest from a list of backup URIs.
Expand Down Expand Up @@ -450,17 +472,6 @@ func writeNewLatestFile(
// sorted to the top. This will be the last latest file we write. It
// Takes the one's complement of the timestamp so that files are sorted
// lexicographically such that the most recent is always the top.
return cloud.WriteFile(ctx, exportStore, newTimestampedLatestFileName(), strings.NewReader(suffix))
}

// newTimestampedLatestFileName returns a string of a new latest filename
// with a suffixed version. It returns it in the format of LATEST-<version>
// where version is a hex encoded one's complement of the timestamp.
// This means that as long as the supplied timestamp is correct, the filenames
// will adhere to a lexicographical/utf-8 ordering such that the most
// recent file is at the top.
func newTimestampedLatestFileName() string {
var buffer []byte
buffer = encoding.EncodeStringDescending(buffer, timeutil.Now().String())
return fmt.Sprintf("%s/%s-%s", latestHistoryDirectory, latestFileName, hex.EncodeToString(buffer))
versionedLatestFileName := newTimestampedFilename(latestFileName)
return cloud.WriteFile(ctx, exportStore, latestHistoryDirectory+"/"+versionedLatestFileName, strings.NewReader(suffix))
}
36 changes: 18 additions & 18 deletions pkg/ccl/backupccl/backup_destination_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ func TestBackupRestoreResolveDestination(t *testing.T) {
defaultDest, localitiesDest, err := getURIsByLocalityKV(to, "")
require.NoError(t, err)

collectionURI, defaultURI, chosenSuffix, urisByLocalityKV, prevBackupURIs, err := resolveDest(
backupDestination, err := resolveDest(
ctx, security.RootUserName(),
jobspb.BackupDetails_Destination{To: to},
endTime,
Expand All @@ -131,12 +131,12 @@ func TestBackupRestoreResolveDestination(t *testing.T) {
require.NoError(t, err)

// Not an INTO backup, so no collection of suffix info.
require.Equal(t, "", collectionURI)
require.Equal(t, "", chosenSuffix)
require.Equal(t, "", backupDestination.collectionURI)
require.Equal(t, "", backupDestination.chosenSubdir)

require.Equal(t, defaultDest, defaultURI)
require.Equal(t, localitiesDest, urisByLocalityKV)
require.Equal(t, incrementalFrom, prevBackupURIs)
require.Equal(t, defaultDest, backupDestination.defaultURI)
require.Equal(t, localitiesDest, backupDestination.urisByLocalityKV)
require.Equal(t, incrementalFrom, backupDestination.prevBackupURIs)
}

// The first initial full backup: BACKUP TO full.
Expand Down Expand Up @@ -188,7 +188,7 @@ func TestBackupRestoreResolveDestination(t *testing.T) {
) {
endTime := hlc.Timestamp{WallTime: backupTime.UnixNano()}

collectionURI, defaultURI, chosenSuffix, urisByLocalityKV, prevBackupURIs, err := resolveDest(
backupDestination, err := resolveDest(
ctx, security.RootUserName(),
jobspb.BackupDetails_Destination{To: to},
endTime,
Expand All @@ -198,11 +198,11 @@ func TestBackupRestoreResolveDestination(t *testing.T) {
require.NoError(t, err)

// Not a backup collection.
require.Equal(t, "", collectionURI)
require.Equal(t, "", chosenSuffix)
require.Equal(t, expectedDefault, defaultURI)
require.Equal(t, expectedLocalities, urisByLocalityKV)
require.Equal(t, expectedPrevBackups, prevBackupURIs)
require.Equal(t, "", backupDestination.collectionURI)
require.Equal(t, "", backupDestination.chosenSubdir)
require.Equal(t, expectedDefault, backupDestination.defaultURI)
require.Equal(t, expectedLocalities, backupDestination.urisByLocalityKV)
require.Equal(t, expectedPrevBackups, backupDestination.prevBackupURIs)
}

// Initial full backup: BACKUP TO baseDir.
Expand Down Expand Up @@ -348,7 +348,7 @@ func TestBackupRestoreResolveDestination(t *testing.T) {
if expectedIncDir != "" {
fullBackupExists = true
}
collectionURI, defaultURI, chosenSuffix, urisByLocalityKV, prevBackupURIs, err := resolveDest(
backupDestination, err := resolveDest(
ctx, security.RootUserName(),
jobspb.BackupDetails_Destination{To: collectionTo, Subdir: subdir,
IncrementalStorage: incrementalTo, Exists: fullBackupExists},
Expand All @@ -368,13 +368,13 @@ func TestBackupRestoreResolveDestination(t *testing.T) {
localityDests[locality] = u.String()
}

require.Equal(t, collectionLoc, collectionURI)
require.Equal(t, expectedSuffix, chosenSuffix)
require.Equal(t, collectionLoc, backupDestination.collectionURI)
require.Equal(t, expectedSuffix, backupDestination.chosenSubdir)

require.Equal(t, expectedDefault, defaultURI)
require.Equal(t, localityDests, urisByLocalityKV)
require.Equal(t, expectedDefault, backupDestination.defaultURI)
require.Equal(t, localityDests, backupDestination.urisByLocalityKV)

require.Equal(t, expectedPrevBackups, prevBackupURIs)
require.Equal(t, expectedPrevBackups, backupDestination.prevBackupURIs)
}

// Initial: BACKUP INTO collection
Expand Down
Loading

0 comments on commit c770ee3

Please sign in to comment.