Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: use force detach as backoff when disk detach failed #2211

Merged
merged 2 commits into from
Mar 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ go 1.21

require (
github.com/Azure/azure-sdk-for-go v68.0.0+incompatible
github.com/Azure/azure-sdk-for-go/sdk/azcore v1.9.2
github.com/Azure/azure-sdk-for-go/sdk/azcore v1.10.0
github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v5 v5.5.0
github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/network/armnetwork/v4 v4.3.0
github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/resources/armresources v1.2.0
Expand Down Expand Up @@ -39,9 +39,9 @@ require (
k8s.io/mount-utils v0.29.2
k8s.io/pod-security-admission v0.0.0
k8s.io/utils v0.0.0-20231127182322-b307cd553661
sigs.k8s.io/cloud-provider-azure v1.29.1-0.20240221180107-c5bbfe811391
sigs.k8s.io/cloud-provider-azure/pkg/azclient v0.0.0-20240221180107-c5bbfe811391
sigs.k8s.io/cloud-provider-azure/pkg/azclient/configloader v0.0.0-20240221180107-c5bbfe811391
sigs.k8s.io/cloud-provider-azure v1.29.1-0.20240303182306-a20684d9eb38
sigs.k8s.io/cloud-provider-azure/pkg/azclient v0.0.0-20240303182306-a20684d9eb38
sigs.k8s.io/cloud-provider-azure/pkg/azclient/configloader v0.0.0-20240303182306-a20684d9eb38
sigs.k8s.io/yaml v1.4.0
)

Expand Down Expand Up @@ -130,7 +130,7 @@ require (
go.opentelemetry.io/proto/otlp v1.1.0 // indirect
go.uber.org/multierr v1.11.0 // indirect
go.uber.org/zap v1.26.0 // indirect
golang.org/x/crypto v0.19.0 // indirect
golang.org/x/crypto v0.20.0 // indirect
golang.org/x/exp v0.0.0-20231214170342-aacd6d4b4611 // indirect
golang.org/x/mod v0.14.0 // indirect
golang.org/x/oauth2 v0.16.0 // indirect
Expand Down
20 changes: 10 additions & 10 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -603,8 +603,8 @@ gioui.org v0.0.0-20210308172011-57750fc8a0a6/go.mod h1:RSH6KIUZ0p2xy5zHDxgAM4zum
git.sr.ht/~sbinet/gg v0.3.1/go.mod h1:KGYtlADtqsqANL9ueOFkWymvzUvLMQllU5Ixo+8v3pc=
github.com/Azure/azure-sdk-for-go v68.0.0+incompatible h1:fcYLmCpyNYRnvJbPerq7U0hS+6+I79yEDJBqVNcqUzU=
github.com/Azure/azure-sdk-for-go v68.0.0+incompatible/go.mod h1:9XXNKU+eRnpl9moKnB4QOLf1HestfXbmab5FXxiDBjc=
github.com/Azure/azure-sdk-for-go/sdk/azcore v1.9.2 h1:c4k2FIYIh4xtwqrQwV0Ct1v5+ehlNXj5NI/MWVsiTkQ=
github.com/Azure/azure-sdk-for-go/sdk/azcore v1.9.2/go.mod h1:5FDJtLEO/GxwNgUxbwrY3LP0pEoThTQJtk2oysdXHxM=
github.com/Azure/azure-sdk-for-go/sdk/azcore v1.10.0 h1:n1DH8TPV4qqPTje2RcUBYwtrTWlabVp4n46+74X2pn4=
github.com/Azure/azure-sdk-for-go/sdk/azcore v1.10.0/go.mod h1:HDcZnuGbiyppErN6lB+idp4CKhjbc8gwjto6OPpyggM=
github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.5.1 h1:sO0/P7g68FrryJzljemN+6GTssUXdANk6aJ7T1ZxnsQ=
github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.5.1/go.mod h1:h8hyGFDsU5HMivxiS2iYFZsgDbU9OnnJ163x5UGVKYo=
github.com/Azure/azure-sdk-for-go/sdk/internal v1.5.2 h1:LqbJ/WzJUwBf8UiaSzgX7aMclParm9/5Vgp+TY51uBQ=
Expand Down Expand Up @@ -1214,8 +1214,8 @@ golang.org/x/crypto v0.9.0/go.mod h1:yrmDGqONDYtNj3tH8X9dzUun2m2lzPa9ngI6/RUPGR0
golang.org/x/crypto v0.11.0/go.mod h1:xgJhtzW8F9jGdVFWZESrid1U1bjeNy4zgy5cRr/CIio=
golang.org/x/crypto v0.12.0/go.mod h1:NF0Gs7EO5K4qLn+Ylc+fih8BSTeIjAP05siRnAh98yw=
golang.org/x/crypto v0.14.0/go.mod h1:MVFd36DqK4CsrnJYDkBA3VC4m2GkXAM0PvzMCn4JQf4=
golang.org/x/crypto v0.19.0 h1:ENy+Az/9Y1vSrlrvBSyna3PITt4tiZLf7sgCjZBX7Wo=
golang.org/x/crypto v0.19.0/go.mod h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDfU=
golang.org/x/crypto v0.20.0 h1:jmAMJJZXr5KiCw05dfYK9QnqaqKLYXijU23lsEdcQqg=
golang.org/x/crypto v0.20.0/go.mod h1:Xwo95rrVNIoSMx9wa1JroENMToLWn3RNVrTBpLHgZPQ=
golang.org/x/exp v0.0.0-20180321215751-8460e604b9de/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
golang.org/x/exp v0.0.0-20180807140117-3d87b88a115f/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
Expand Down Expand Up @@ -2035,12 +2035,12 @@ rsc.io/quote/v3 v3.1.0/go.mod h1:yEA65RcK8LyAZtP9Kv3t0HmxON59tX3rD+tICJqUlj0=
rsc.io/sampler v1.3.0/go.mod h1:T1hPZKmBbMNahiBKFy5HrXp6adAjACjK9JXDnKaTXpA=
sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.28.0 h1:TgtAeesdhpm2SGwkQasmbeqDo8th5wOBA5h/AjTKA4I=
sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.28.0/go.mod h1:VHVDI/KrK4fjnV61bE2g3sA7tiETLn8sooImelsCx3Y=
sigs.k8s.io/cloud-provider-azure v1.29.1-0.20240221180107-c5bbfe811391 h1:kEj4iFIYaCReKrmf7ukOV8xI8AKctwpSugs3JJ6myBw=
sigs.k8s.io/cloud-provider-azure v1.29.1-0.20240221180107-c5bbfe811391/go.mod h1:umKbPjCVRbMKDmPKRVlKfjICfIfF0AtwFWXXphEJMmk=
sigs.k8s.io/cloud-provider-azure/pkg/azclient v0.0.0-20240221180107-c5bbfe811391 h1:0U5j4Oa0pPJUwJ8mRlwyW7IovSPu3tGWiqExn+iJJGE=
sigs.k8s.io/cloud-provider-azure/pkg/azclient v0.0.0-20240221180107-c5bbfe811391/go.mod h1:cyVxBbpj/UR4Vi4KhxO3vhzBB2JcN08TngoPQ0J0aWc=
sigs.k8s.io/cloud-provider-azure/pkg/azclient/configloader v0.0.0-20240221180107-c5bbfe811391 h1:63qVeJG7eL/v8T+nM1zm2MysXH/g/0o0jyff+QAe5HQ=
sigs.k8s.io/cloud-provider-azure/pkg/azclient/configloader v0.0.0-20240221180107-c5bbfe811391/go.mod h1:8Y2dnp0FlQcoFllRnUgTZjMykhMAUF7NrPLgw30Ae4I=
sigs.k8s.io/cloud-provider-azure v1.29.1-0.20240303182306-a20684d9eb38 h1:ErE4z3owUFDSD0WwtZSnlIOWMWKgzwQYRpgg2VJj/w0=
sigs.k8s.io/cloud-provider-azure v1.29.1-0.20240303182306-a20684d9eb38/go.mod h1:d/ZH6rj+vQR2XQs8vto7fzfBVfv53q2zgAVqik4w1+Y=
sigs.k8s.io/cloud-provider-azure/pkg/azclient v0.0.0-20240303182306-a20684d9eb38 h1:eMXW6P7wNSAAioz0DtO6YUwV/IiQ5/vukFOnvVc5JQE=
sigs.k8s.io/cloud-provider-azure/pkg/azclient v0.0.0-20240303182306-a20684d9eb38/go.mod h1:wst9Z0P3nkKbhg/+Y9K1ku8Fue0j6TZJdfa5w8cQ4dE=
sigs.k8s.io/cloud-provider-azure/pkg/azclient/configloader v0.0.0-20240303182306-a20684d9eb38 h1:1dFLHc+l6W/g5JdpsyVOkpvu+gIp98Obo9bCK6Jh31g=
sigs.k8s.io/cloud-provider-azure/pkg/azclient/configloader v0.0.0-20240303182306-a20684d9eb38/go.mod h1:tN1tg2D4Z1nfmiYoSNmIFtGPT6DjB9UyQ7tGuBLb+SI=
sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd h1:EDPBXCAspyGV4jQlpZSudPeMmr1bNJefnuqLsRAsHZo=
sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd/go.mod h1:B8JuhiUyNFVKdsE8h686QcCxMaH6HrOAZj4vswFpcB0=
sigs.k8s.io/structured-merge-diff/v4 v4.2.3/go.mod h1:qjx8mGObPmV2aSZepjQjbmb2ihdVs8cGKBraizNC69E=
Expand Down
8 changes: 7 additions & 1 deletion pkg/azuredisk/azure_controller_common.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ import (
"k8s.io/klog/v2"
"k8s.io/utils/pointer"

"sigs.k8s.io/azuredisk-csi-driver/pkg/azureutils"
"sigs.k8s.io/cloud-provider-azure/pkg/azclient"
azcache "sigs.k8s.io/cloud-provider-azure/pkg/cache"
"sigs.k8s.io/cloud-provider-azure/pkg/consts"
Expand Down Expand Up @@ -100,6 +101,7 @@ type controllerCommon struct {
DisableDiskLunCheck bool
// AttachDetachInitialDelayInMs determines initial delay in milliseconds for batch disk attach/detach
AttachDetachInitialDelayInMs int
ForceDetachBackoff bool
}

// ExtendedLocation contains additional info about the location of resources.
Expand Down Expand Up @@ -344,13 +346,17 @@ func (c *controllerCommon) DetachDisk(ctx context.Context, diskName, diskURI str
if len(diskMap) > 0 {
c.diskStateMap.Store(disk, "detaching")
defer c.diskStateMap.Delete(disk)
if err = vmset.DetachDisk(ctx, nodeName, diskMap); err != nil {
if err = vmset.DetachDisk(ctx, nodeName, diskMap, false); err != nil {
if isInstanceNotFoundError(err) {
// if host doesn't exist, no need to detach
klog.Warningf("azureDisk - got InstanceNotFoundError(%v), DetachDisk(%s) will assume disk is already detached",
err, diskURI)
return nil
}
if c.ForceDetachBackoff && !azureutils.IsThrottlingError(err) {
klog.Errorf("azureDisk - DetachDisk(%s) from node %s failed with error: %v, retry with force detach", diskURI, nodeName, err)
err = vmset.DetachDisk(ctx, nodeName, diskMap, true)
}
}
}

Expand Down
3 changes: 3 additions & 0 deletions pkg/azuredisk/azuredisk.go
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ type DriverCore struct {
enableOtelTracing bool
shouldWaitForSnapshotReady bool
checkDiskLUNCollision bool
forceDetachBackoff bool
endpoint string
disableAVSetNodes bool
kubeClient kubernetes.Interface
Expand Down Expand Up @@ -147,6 +148,7 @@ func newDriverV1(options *DriverOptions) *Driver {
driver.enableOtelTracing = options.EnableOtelTracing
driver.shouldWaitForSnapshotReady = options.WaitForSnapshotReady
driver.checkDiskLUNCollision = options.CheckDiskLUNCollision
driver.forceDetachBackoff = options.ForceDetachBackoff
driver.endpoint = options.Endpoint
driver.disableAVSetNodes = options.DisableAVSetNodes
driver.volumeLocks = volumehelper.NewVolumeLocks()
Expand Down Expand Up @@ -185,6 +187,7 @@ func newDriverV1(options *DriverOptions) *Driver {
driver.diskController = NewManagedDiskController(driver.cloud)
driver.diskController.DisableUpdateCache = driver.disableUpdateCache
driver.diskController.AttachDetachInitialDelayInMs = int(driver.attachDetachInitialDelayInMs)
driver.diskController.ForceDetachBackoff = driver.forceDetachBackoff
driver.clientFactory = driver.cloud.ComputeClientFactory
if driver.vmType != "" {
klog.V(2).Infof("override VMType(%s) in cloud config as %s", driver.cloud.VMType, driver.vmType)
Expand Down
2 changes: 2 additions & 0 deletions pkg/azuredisk/azuredisk_option.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ type DriverOptions struct {
GetNodeIDFromIMDS bool
WaitForSnapshotReady bool
CheckDiskLUNCollision bool
ForceDetachBackoff bool
Kubeconfig string
Endpoint string
DisableAVSetNodes bool
Expand Down Expand Up @@ -94,6 +95,7 @@ func (o *DriverOptions) AddFlags() *flag.FlagSet {
fs.BoolVar(&o.GetNodeIDFromIMDS, "get-nodeid-from-imds", false, "boolean flag to get NodeID from IMDS")
fs.BoolVar(&o.WaitForSnapshotReady, "wait-for-snapshot-ready", true, "boolean flag to wait for snapshot ready when creating snapshot in same region")
fs.BoolVar(&o.CheckDiskLUNCollision, "check-disk-lun-collision", true, "boolean flag to check disk lun collisio before attaching disk")
fs.BoolVar(&o.ForceDetachBackoff, "force-detach-backoff", true, "boolean flag to force detach in disk detach backoff")
fs.StringVar(&o.Kubeconfig, "kubeconfig", "", "Absolute path to the kubeconfig file. Required only when running out of cluster.")
fs.BoolVar(&o.DisableAVSetNodes, "disable-avset-nodes", false, "disable DisableAvailabilitySetNodes in cloud config for controller")
fs.StringVar(&o.Endpoint, "endpoint", "unix://tmp/csi.sock", "CSI endpoint")
Expand Down
4 changes: 2 additions & 2 deletions pkg/azuredisk/controllerserver.go
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,7 @@ func (d *Driver) CreateVolume(ctx context.Context, req *csi.CreateVolumeRequest)
lockMap: newLockMap(),
DisableDiskLunCheck: true,
clientFactory: localCloud.ComputeClientFactory,
ForceDetachBackoff: d.forceDetachBackoff,
},
}
localDiskController.DisableUpdateCache = d.disableUpdateCache
Expand Down Expand Up @@ -443,8 +444,7 @@ func (d *Driver) ControllerPublishVolume(ctx context.Context, req *csi.Controlle
// Volume is already attached to node.
klog.V(2).Infof("Attach operation is successful. volume %s is already attached to node %s at lun %d.", diskURI, nodeName, lun)
} else {
if strings.Contains(strings.ToLower(err.Error()), strings.ToLower(consts.TooManyRequests)) ||
strings.Contains(strings.ToLower(err.Error()), consts.ClientThrottled) {
if azureutils.IsThrottlingError(err) {
return nil, status.Errorf(codes.Internal, err.Error())
}
var cachingMode armcompute.CachingTypes
Expand Down
3 changes: 1 addition & 2 deletions pkg/azuredisk/controllerserver_v2.go
Original file line number Diff line number Diff line change
Expand Up @@ -369,8 +369,7 @@ func (d *DriverV2) ControllerPublishVolume(ctx context.Context, req *csi.Control
// Volume is already attached to node.
klog.V(2).Infof("Attach operation is successful. volume %s is already attached to node %s at lun %d.", diskURI, nodeName, lun)
} else {
if strings.Contains(strings.ToLower(err.Error()), strings.ToLower(consts.TooManyRequests)) ||
strings.Contains(strings.ToLower(err.Error()), consts.ClientThrottled) {
if azureutils.IsThrottlingError(err) {
return nil, status.Errorf(codes.Internal, err.Error())
}
var cachingMode armcompute.CachingTypes
Expand Down
10 changes: 9 additions & 1 deletion pkg/azureutils/azure_disk_utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -758,7 +758,7 @@ func InsertDiskProperties(disk *armcompute.Disk, publishConext map[string]string
}

func SleepIfThrottled(err error, defaultSleepSec int) {
if err != nil && strings.Contains(strings.ToLower(err.Error()), strings.ToLower(consts.TooManyRequests)) || strings.Contains(strings.ToLower(err.Error()), consts.ClientThrottled) {
if err != nil && IsThrottlingError(err) {
retryAfter := getRetryAfterSeconds(err)
if retryAfter == 0 {
retryAfter = defaultSleepSec
Expand All @@ -768,6 +768,14 @@ func SleepIfThrottled(err error, defaultSleepSec int) {
}
}

func IsThrottlingError(err error) bool {
if err != nil {
errMsg := strings.ToLower(err.Error())
return strings.Contains(errMsg, strings.ToLower(consts.TooManyRequests)) || strings.Contains(errMsg, consts.ClientThrottled)
}
return false
}

// getRetryAfterSeconds returns the number of seconds to wait from the error message
func getRetryAfterSeconds(err error) int {
if err == nil {
Expand Down
42 changes: 42 additions & 0 deletions pkg/azureutils/azure_disk_utils_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1902,3 +1902,45 @@ func TestGetRetryAfterSeconds(t *testing.T) {
}
}
}

func TestIsThrottlingError(t *testing.T) {
tests := []struct {
desc string
err error
expected bool
}{
{
desc: "nil error",
err: nil,
expected: false,
},
{
desc: "no match",
err: errors.New("no match"),
expected: false,
},
{
desc: "match error message",
err: errors.New("could not list storage accounts for account type Premium_LRS: Retriable: true, RetryAfter: 217s, HTTPStatusCode: 0, RawError: azure cloud provider throttled for operation StorageAccountListByResourceGroup with reason \"client throttled\""),
expected: true,
},
{
desc: "match error message exceeds 1200s",
err: errors.New("could not list storage accounts for account type Premium_LRS: Retriable: true, RetryAfter: 2170s, HTTPStatusCode: 0, RawError: azure cloud provider throttled for operation StorageAccountListByResourceGroup with reason \"client throttled\""),
expected: true,
},
{
desc: "match error message with TooManyRequests throttling",
err: errors.New("could not list storage accounts for account type Premium_LRS: Retriable: true, RetryAfter: 2170s, HTTPStatusCode: 429, RawError: azure cloud provider throttled for operation StorageAccountListByResourceGroup with reason \"TooManyRequests\""),
expected: true,
},
}

for _, test := range tests {
result := IsThrottlingError(test.err)
if result != test.expected {
t.Errorf("desc: (%s), input: err(%v), IsThrottlingError returned with bool(%t), not equal to expected(%t)",
test.desc, test.err, result, test.expected)
}
}
}

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading
Loading