Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

retry on network failure for detecting EC2 #397

Merged
merged 4 commits into from
Mar 9, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion packaging/dependencies/amazon-cloudwatch-agent.service
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

[Unit]
Description=Amazon CloudWatch Agent
After=network-online.target
After=network.target

[Service]
Type=simple
Expand Down
61 changes: 47 additions & 14 deletions translator/util/ec2util/ec2util.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,14 @@
package ec2util

import (
"log"
"sync"

"github.com/aws/amazon-cloudwatch-agent/translator/config"
"github.com/aws/amazon-cloudwatch-agent/translator/context"
"github.com/aws/aws-sdk-go/aws/ec2metadata"
"github.com/aws/aws-sdk-go/aws/session"
"log"
"net"
"sync"
"time"
)

// this is a singleton struct
Expand All @@ -21,6 +22,8 @@ type ec2Util struct {
Hostname string
}

const allowedRetries = 5

var e *ec2Util
var once sync.Once

Expand All @@ -37,39 +40,69 @@ func initEC2UtilSingleton() (newInstance *ec2Util) {
return
}

ses, e := session.NewSession()
if e != nil {
log.Println("E! [EC2] getting new session info: ", e)
// Need to account for the scenario where a user is running the CloudWatch agent on-premises
// and doesn't require connectivity with the EC2 instance metadata service, while still
// gracefully waiting for network access on EC2 instances.
networkUp := false
for retry := 0; !networkUp && retry < allowedRetries; retry++ {
ifs, err := net.Interfaces()

if err != nil {
log.Println("E! [EC2] An error occurred while fetching network interfaces: ", err)
}

for _, in := range ifs {
if (in.Flags&net.FlagUp) != 0 && (in.Flags&net.FlagLoopback) == 0 {
SaxyPandaBear marked this conversation as resolved.
Show resolved Hide resolved
networkUp = true
break
}
}
if networkUp {
log.Println("D! [EC2] Found active network interface")
break
}

log.Println("W! [EC2] Sleep until network is up")
time.Sleep(1 * time.Second)
}
if !networkUp {
log.Println("E! [EC2] No available network interface")
}

ses, err := session.NewSession()
if err != nil {
log.Println("E! [EC2] getting new session info: ", err)
return
}
md := ec2metadata.New(ses)

if !md.Available() {
log.Println("E! ec2metadata is not available")
return
}

if info, e := md.GetMetadata("instance-id"); e == nil {
if info, err := md.GetMetadata("instance-id"); err == nil {
newInstance.InstanceID = info
} else {
log.Println("E! getting instance-id from EC2 metadata fail: ", e)
log.Println("E! getting instance-id from EC2 metadata fail: ", err)
}

if info, e := md.GetMetadata("hostname"); e == nil {
if info, err := md.GetMetadata("hostname"); err == nil {
newInstance.Hostname = info
} else {
log.Println("E! getting hostname from EC2 metadata fail: ", e)
log.Println("E! getting hostname from EC2 metadata fail: ", err)
}

if info, e := md.GetMetadata("local-ipv4"); e == nil {
if info, err := md.GetMetadata("local-ipv4"); err == nil {
newInstance.PrivateIP = info
} else {
log.Println("E! getting local-ipv4 from EC2 metadata fail: ", e)
log.Println("E! getting local-ipv4 from EC2 metadata fail: ", err)
}

if info, e := md.GetInstanceIdentityDocument(); e == nil {
if info, err := md.GetInstanceIdentityDocument(); err == nil {
newInstance.Region = info.Region
} else {
log.Println("E! getting region from EC2 metadata fail: ", e)
log.Println("E! getting region from EC2 metadata fail: ", err)
}

return
Expand Down