Skip to content

Commit

Permalink
Fix Google Cloud races (#5081)
Browse files Browse the repository at this point in the history
* storage/gcs: fix race condition in releasing lock

Previously we were deleting a lock without first checking if the lock we were deleting was our own. There existed a small period of time where vault-0 would lose leadership and vault-1 would get leadership. vault-0 would delete the lock key while vault-1 would write it. If vault-0 won, there'd be another leader election, etc.

This fixes the race by using a CAS operation instead.

* storage/gcs: properly break out of loop during stop

* storage/spanner: properly break out of loop during stop
  • Loading branch information
sethvargo authored and jefferai committed Aug 14, 2018
1 parent 8754694 commit 19f1a94
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 9 deletions.
38 changes: 32 additions & 6 deletions physical/gcs/gcs_ha.go
Original file line number Diff line number Diff line change
Expand Up @@ -180,10 +180,30 @@ func (l *Lock) Unlock() error {
}
l.stopLock.Unlock()

// Delete
// Read the record value before deleting. This needs to be a CAS operation or
// else we might be deleting someone else's lock.
ctx := context.Background()
if err := l.backend.Delete(ctx, l.key); err != nil {
return err
r, err := l.get(ctx)
if err != nil {
return errwrap.Wrapf("failed to read lock for deletion: {{err}}", err)
}
if r != nil && r.Identity == l.identity {
ctx := context.Background()
conds := storage.Conditions{
GenerationMatch: r.attrs.Generation,
MetagenerationMatch: r.attrs.Metageneration,
}

obj := l.backend.client.Bucket(l.backend.bucket).Object(l.key)
if err := obj.If(conds).Delete(ctx); err != nil {
// If the pre-condition failed, it means that someone else has already
// acquired the lock and we don't want to delete it.
if terr, ok := err.(*googleapi.Error); ok && terr.Code == 412 {
l.backend.logger.Debug("unlock: preconditions failed (lock already taken by someone else?)")
} else {
return errwrap.Wrapf("failed to delete lock: {{err}}", err)
}
}
}

// We are no longer holding the lock
Expand Down Expand Up @@ -254,20 +274,26 @@ func (l *Lock) watchLock() {
retries := 0
ticker := time.NewTicker(l.watchRetryInterval)

OUTER:
for {
// Check if the channel is already closed
select {
case <-l.stopCh:
break OUTER
default:
}

// Check if we've exceeded retries
if retries >= l.watchRetryMax-1 {
break
break OUTER
}

// Wait for the timer
<-ticker.C
select {
case <-ticker.C:
case <-l.stopCh:
break OUTER
}

// Attempt to read the key
r, err := l.get(context.Background())
Expand All @@ -278,7 +304,7 @@ func (l *Lock) watchLock() {

// Verify the identity is the same
if r == nil || r.Identity != l.identity {
break
break OUTER
}
}

Expand Down
12 changes: 9 additions & 3 deletions physical/spanner/spanner_ha.go
Original file line number Diff line number Diff line change
Expand Up @@ -277,20 +277,26 @@ func (l *Lock) watchLock() {
retries := 0
ticker := time.NewTicker(l.watchRetryInterval)

OUTER:
for {
// Check if the channel is already closed
select {
case <-l.stopCh:
break OUTER
default:
}

// Check if we've exceeded retries
if retries >= l.watchRetryMax-1 {
break
break OUTER
}

// Wait for the timer
<-ticker.C
select {
case <-ticker.C:
case <-l.stopCh:
break OUTER
}

// Attempt to read the key
r, err := l.get(context.Background())
Expand All @@ -301,7 +307,7 @@ func (l *Lock) watchLock() {

// Verify the identity is the same
if r == nil || r.Identity != l.identity {
break
break OUTER
}
}

Expand Down

0 comments on commit 19f1a94

Please sign in to comment.