Skip to content

Commit

Permalink
DLPX-87473 stress.reboot.test_repeated_reboots failed with crash dump…
Browse files Browse the repository at this point in the history
… on OCI (openzfs#1129)

The problem is the same as DLPX-87209
(https://github.com/delphix/zfs/pull/1081).  The fix for that had no
effect, because it modified the local variable `client` which is not
used.  We need to instead modify the client that has already been set in
the `clientInfo`.  Additionally, we need to disable "eventual
consistency" so that our "ShouldRetryOperation" is actually used.  This
is safe because OCI's object storage is actually fully consistent
according to the docs
(https://docs.oracle.com/en-us/iaas/Content/Object/Concepts/objectstorageoverview.htm)

Also restructure the code so that the local variable is not in scope
after it's copied to the ClientInfo.
  • Loading branch information
ahrens authored Aug 21, 2023
1 parent c344496 commit 36ef106
Showing 1 changed file with 16 additions and 14 deletions.
30 changes: 16 additions & 14 deletions lib/liboci_sdk/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -215,20 +215,18 @@ func FatalIfError(err error) {
}
}

//export NewOCIClient
func NewOCIClient() C.uint {

func NewOCIClientInfo() ClientInfo {
// Use an instance principal provider
provider, err := auth.InstancePrincipalConfigurationProvider()
log.Info("provider: ", provider)
FatalIfError(err)

client, err := objectstorage.NewObjectStorageClientWithConfigurationProvider(provider)
log.Info("client: ", client)
FatalIfError(err)
return ClientInfo{client, atomic.Uint64{}}
}

// store client in a global map (we can't pass back structs with embedded pointers)
clientInfo := ClientInfo{client, atomic.Uint64{}}
//export NewOCIClient
func NewOCIClient() C.uint {
clientInfo := NewOCIClientInfo()

retryCountRateLimitErrors := func(r common.OCIOperationResponse) bool {
if r.Error == nil && r.Response.HTTPResponse().StatusCode == 429 {
Expand All @@ -240,19 +238,23 @@ func NewOCIClient() C.uint {
// we simply retry all network failures, obviously ephemeral or otherwise, so rather than
// trying to patch individual failures as they occur, we adopt that approach here as well.
if r.Error != nil {
log.Debug("ShouldRetry error: ", r.Error)
if _, ok := r.Error.(net.Error); ok {
return true
}
}
return common.DefaultShouldRetryOperation(r)
}
retryPolicy := common.NewRetryPolicyWithOptions(
common.WithUnlimitedAttempts(time.Duration(24*time.Hour)),
common.WithExponentialBackoff(time.Duration(10*time.Second), 2),
common.WithShouldRetryOperation(retryCountRateLimitErrors),
)
client.SetCustomClientConfiguration(common.CustomClientConfiguration{RetryPolicy: &retryPolicy})
retryPolicy := common.DefaultRetryPolicyWithoutEventualConsistency()
common.WithUnlimitedAttempts(time.Duration(24 * time.Hour))(&retryPolicy)
common.WithExponentialBackoff(time.Duration(10*time.Second), 2)(&retryPolicy)
common.WithShouldRetryOperation(retryCountRateLimitErrors)(&retryPolicy)

clientInfo.client.SetCustomClientConfiguration(common.CustomClientConfiguration{
RetryPolicy: &retryPolicy,
})

// store client in a global map (we can't pass back structs with embedded pointers)
cid := globalState.Insert(&clientInfo)
log.Info("NewOCIClient() ->", cid)
log.Info("GOMAXPROCS():", runtime.GOMAXPROCS(-1))
Expand Down

0 comments on commit 36ef106

Please sign in to comment.