Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix shard close error type #3215

Merged
merged 2 commits into from
Aug 11, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion service/history/queueProcessor.go
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,7 @@ eventLoop:
p.options.UpdateAckInterval(),
p.options.UpdateAckIntervalJitterCoefficient(),
))
if err := p.ackMgr.updateQueueAckLevel(); err == shard.ErrShardClosed {
if err := p.ackMgr.updateQueueAckLevel(); shard.IsShardOwnershipLostError(err) {
// shard is no longer owned by this instance, bail out
go p.Stop()
break eventLoop
Expand Down
15 changes: 10 additions & 5 deletions service/history/shard/context_impl.go
Original file line number Diff line number Diff line change
Expand Up @@ -157,9 +157,6 @@ type (
var _ Context = (*ContextImpl)(nil)

var (
// ErrShardClosed is returned when shard is closed and a req cannot be processed
ErrShardClosed = serviceerror.NewUnavailable("shard closed")

// ErrShardStatusUnknown means we're not sure if we have the shard lock or not. This may be returned
// during short windows at initialization and if we've lost the connection to the database.
ErrShardStatusUnknown = serviceerror.NewUnavailable("shard status unknown")
Expand Down Expand Up @@ -1129,7 +1126,7 @@ func (s *ContextImpl) errorByState() error {
case contextStateAcquired:
return nil
case contextStateStopping, contextStateStopped:
return ErrShardClosed
return s.newShardClosedErrorWithShardID()
default:
panic("invalid state")
}
Expand Down Expand Up @@ -1810,7 +1807,7 @@ func (s *ContextImpl) acquireShard() {

op := func() error {
if !s.isValid() {
return ErrShardClosed
return s.newShardClosedErrorWithShardID()
}

// Initial load of shard metadata
Expand Down Expand Up @@ -2038,6 +2035,14 @@ func (s *ContextImpl) newIOContext() (context.Context, context.CancelFunc) {
return ctx, cancel
}

// newShardClosedErrorWithShardID builds the error returned when the shard is
// closed and a request cannot be processed. The returned
// *persistence.ShardOwnershipLostError carries this context's shard ID and the
// message "shard closed".
func (s *ContextImpl) newShardClosedErrorWithShardID() *persistence.ShardOwnershipLostError {
	closedErr := new(persistence.ShardOwnershipLostError)
	closedErr.ShardID = s.shardID // immutable
	closedErr.Msg = "shard closed"
	return closedErr
}

func OperationPossiblySucceeded(err error) bool {
switch err.(type) {
case *persistence.CurrentWorkflowConditionFailedError,
Expand Down
13 changes: 6 additions & 7 deletions service/history/shard/controller_impl.go
Original file line number Diff line number Diff line change
Expand Up @@ -302,9 +302,10 @@ func (c *ControllerImpl) removeShard(shardID int32, expected *ContextImpl) (*Con
// ControllerImpl. It is responsible for acquiring /
// releasing shards in response to any event that can
// change the shard ownership. These events are
// a. Ring membership change
// b. Periodic ticker
// c. ShardOwnershipLostError and subsequent ShardClosedEvents from engine
//
// a. Ring membership change
// b. Periodic ticker
// c. ShardOwnershipLostError and subsequent ShardClosedEvents from engine
func (c *ControllerImpl) shardManagementPump() {
defer c.shutdownWG.Done()

Expand Down Expand Up @@ -423,13 +424,11 @@ func (c *ControllerImpl) ShardIDs() []int32 {
}

func IsShardOwnershipLostError(err error) bool {
if err == ErrShardClosed {
return true
}

switch err.(type) {
case *persistence.ShardOwnershipLostError:
return true
case *serviceerrors.ShardOwnershipLost:
return true
}

return false
Expand Down
4 changes: 2 additions & 2 deletions service/history/timerQueueProcessor.go
Original file line number Diff line number Diff line change
Expand Up @@ -310,8 +310,8 @@ func (t *timerQueueProcessorImpl) completeTimersLoop() {
return false
default:
}
return err != shard.ErrShardClosed
}); err == shard.ErrShardClosed {
return !shard.IsShardOwnershipLostError(err)
}); shard.IsShardOwnershipLostError(err) {
// shard is unloaded, timer processor should quit as well
go t.Stop()
return
Expand Down
2 changes: 1 addition & 1 deletion service/history/timerQueueProcessorBase.go
Original file line number Diff line number Diff line change
Expand Up @@ -282,7 +282,7 @@ eventLoop:
t.config.TimerProcessorUpdateAckInterval(),
t.config.TimerProcessorUpdateAckIntervalJitterCoefficient(),
))
if err := t.timerQueueAckMgr.updateAckLevel(); err == shard.ErrShardClosed {
if err := t.timerQueueAckMgr.updateAckLevel(); shard.IsShardOwnershipLostError(err) {
// shard is closed, shutdown timerQProcessor and bail out
go t.Stop()
return err
Expand Down
4 changes: 2 additions & 2 deletions service/history/transferQueueProcessor.go
Original file line number Diff line number Diff line change
Expand Up @@ -304,8 +304,8 @@ func (t *transferQueueProcessorImpl) completeTransferLoop() {
return false
default:
}
return err != shard.ErrShardClosed
}); err == shard.ErrShardClosed {
return !shard.IsShardOwnershipLostError(err)
}); shard.IsShardOwnershipLostError(err) {
// shard is unloaded, transfer processor should quit as well
t.Stop()
return
Expand Down
5 changes: 2 additions & 3 deletions service/history/visibilityQueueProcessor.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@
package history

import (
"errors"
"sync/atomic"
"time"

Expand Down Expand Up @@ -277,8 +276,8 @@ func (t *visibilityQueueProcessorImpl) completeTaskLoop() {
return false
default:
}
return err != shard.ErrShardClosed
}); errors.Is(err, shard.ErrShardClosed) {
return !shard.IsShardOwnershipLostError(err)
}); shard.IsShardOwnershipLostError(err) {
// shard closed, trigger shutdown and bail out
t.Stop()
return
Expand Down