Skip to content

Commit

Permalink
awscloud: add very verbose logging to createFleet creation
Browse files Browse the repository at this point in the history
We still see this error sometimes:

Unable to start secure instance: Unable to create fleet: InsufficientInstanceCapacity: There is no Spot capacity available that matches your request

This is odd because the message says that there is no spot
capacity, even though the current code should retry on
InsufficientInstanceCapacity. I also confirmed that no retry happens by
searching for the retry log messages: there are none in the logs.

We need a bigger hammer. Let's log everything that happens in the
createFleet method to better understand why the
retry logic isn't triggered. We should probably move most of the newly
added logs to the debug level, but let's delay that until we have
more insight into what's happening.
  • Loading branch information
ondrejbudai authored and croissanne committed Nov 26, 2024
1 parent 54ffc08 commit 64ff0e3
Showing 1 changed file with 19 additions and 0 deletions.
19 changes: 19 additions & 0 deletions internal/cloud/awscloud/secure-instance.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package awscloud
import (
"context"
"encoding/base64"
"encoding/json"
"errors"
"fmt"
"slices"
Expand Down Expand Up @@ -592,6 +593,7 @@ func (a *AWS) deleteSGIfExists(si *SecureInstance) error {
}

func (a *AWS) createFleet(input *ec2.CreateFleetInput) (*ec2.CreateFleetOutput, error) {
logCreateFleetInput(input)
createFleetOutput, err := a.ec2.CreateFleet(context.Background(), input)
if err != nil {
return createFleetOutput, fmt.Errorf("Unable to create spot fleet: %w", err)
Expand All @@ -602,20 +604,26 @@ func (a *AWS) createFleet(input *ec2.CreateFleetInput) (*ec2.CreateFleetOutput,
logrus.Warnf("Received errors (%s) from CreateFleet, retrying CreateFleet with OnDemand instance", strings.Join(fleetErrs, "; "))
input.SpotOptions = nil
input.TargetCapacitySpecification.DefaultTargetCapacityType = ec2types.DefaultTargetCapacityTypeOnDemand
logCreateFleetInput(input)
createFleetOutput, err = a.ec2.CreateFleet(context.Background(), input)
if err != nil {
return createFleetOutput, fmt.Errorf("Unable to create on demand fleet: %w", err)
}
} else {
logrus.Infof("Won't retry CreateFleet with OnDemand instance, retry: %v, errors: %s", retry, strings.Join(fleetErrs, "; "))
}

retry, fleetErrs = doCreateFleetRetry(createFleetOutput)
if len(fleetErrs) > 0 && retry {
logrus.Warnf("Received errors (%s) from CreateFleet with OnDemand instance option, retrying across availability zones", strings.Join(fleetErrs, "; "))
input.LaunchTemplateConfigs[0].Overrides = nil
logCreateFleetInput(input)
createFleetOutput, err = a.ec2.CreateFleet(context.Background(), input)
if err != nil {
return createFleetOutput, fmt.Errorf("Unable to create on demand fleet across AZs: %w", err)
}
} else {
logrus.Infof("Won't retry CreateFleet across AZs, retry: %v, errors: %s", retry, strings.Join(fleetErrs, "; "))
}

if len(createFleetOutput.Errors) > 0 {
Expand Down Expand Up @@ -650,15 +658,26 @@ func doCreateFleetRetry(cfOutput *ec2.CreateFleetOutput) (bool, []string) {
logrus.Infof("Checking to retry fleet create on error %s (msg: %s)", *err.ErrorCode, *err.ErrorMessage)
if slices.Contains(retryCodes, *err.ErrorCode) {
retry = true
logrus.Infof("doCreateFleetRetry: setting retry to true")
}
msg = append(msg, fmt.Sprintf("%s: %s", *err.ErrorCode, *err.ErrorMessage))
}

// Do not retry in case an instance already exists, in that case just fail and let the worker terminate the SI
if len(cfOutput.Instances) > 0 && len(cfOutput.Instances[0].InstanceIds) > 0 {
logrus.Infof("doCreateFleetRetry: cancelling retry, instance already exists: %s", cfOutput.Instances[0].InstanceIds)
retry = false
msg = append(msg, fmt.Sprintf("Already launched instance (%s), aborting create fleet", cfOutput.Instances[0].InstanceIds))
}

logrus.Infof("doCreateFleetRetry: returning retry: %v, msg: %v", retry, msg)
return retry, msg
}

// logCreateFleetInput logs the given CreateFleet input as JSON at info level.
// If the input cannot be marshalled to JSON, it logs a warning that contains
// the marshalling error together with the input in Go's %+v format.
func logCreateFleetInput(input *ec2.CreateFleetInput) {
	inputJSON, err := json.Marshal(input)
	if err != nil {
		// Report the actual marshalling error (previously dropped); the input
		// is still included so the request remains visible in the logs.
		logrus.Warnf("Unable to marshal input for logging: %v (input: %+v)", err, input)
		return
	}
	logrus.Infof("Creating fleet with input: %s", inputJSON)
}

0 comments on commit 64ff0e3

Please sign in to comment.