diff --git a/ipamd/datastore/data_store.go b/ipamd/datastore/data_store.go index 05e572b737..d83eb8a1a6 100644 --- a/ipamd/datastore/data_store.go +++ b/ipamd/datastore/data_store.go @@ -50,7 +50,7 @@ var ( enis = prometheus.NewGauge( prometheus.GaugeOpts{ Name: "eni_allocated", - Help: "The number of ENI allocated", + Help: "The number of ENIs allocated", }, ) totalIPs = prometheus.NewGauge( @@ -62,7 +62,7 @@ var ( assignedIPs = prometheus.NewGauge( prometheus.GaugeOpts{ Name: "assigned_ip_addresses", - Help: "The number of IP addresses assigned", + Help: "The number of IP addresses assigned to pods", }, ) prometheusRegistered = false diff --git a/ipamd/ipamd.go b/ipamd/ipamd.go index e15f30c285..08b72f6551 100644 --- a/ipamd/ipamd.go +++ b/ipamd/ipamd.go @@ -15,6 +15,8 @@ package ipamd import ( "net" + "os" + "strconv" "strings" "time" @@ -40,27 +42,34 @@ const ( ipPoolMonitorInterval = 5 * time.Second maxRetryCheckENI = 5 eniAttachTime = 10 * time.Second + defaultWarmENITarget = 1 ) var ( ipamdErr = prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "ipamd_error_count", - Help: "the number of errors encountered in ipamd", + Help: "The number of errors encountered in ipamd", }, []string{"fn", "error"}, ) ipamdActionsInprogress = prometheus.NewGaugeVec( prometheus.GaugeOpts{ Name: "ipamd_action_inprogress", - Help: "the number of ipamd actions inprogress", + Help: "The number of ipamd actions inprogress", }, []string{"fn"}, ) enisMax = prometheus.NewGauge( prometheus.GaugeOpts{ Name: "eni_max", - Help: "The number of maximum ENIs can be attached to the instance", + Help: "The maximum number of ENIs that can be attached to the instance", + }, + ) + ipMax = prometheus.NewGauge( + prometheus.GaugeOpts{ + Name: "ip_max", + Help: "The maximum number of IP addresses that can be allocated to the instance", }, ) prometheusRegistered = false @@ -86,6 +95,7 @@ func prometheusRegister() { prometheus.MustRegister(ipamdErr) prometheus.MustRegister(ipamdActionsInprogress) prometheus.MustRegister(enisMax) + prometheus.MustRegister(ipMax) prometheusRegistered = true } } @@ -117,8 +127,16 @@ func New() (*IPAMContext, error) { //TODO need to break this function down(comments from CR) func (c *IPAMContext) nodeInit() error { + ipamdActionsInprogress.WithLabelValues("nodeInit").Add(float64(1)) + defer ipamdActionsInprogress.WithLabelValues("nodeInit").Sub(float64(1)) maxENIs, err := c.awsClient.GetENILimit() - enisMax.Set(float64(maxENIs)) + if err == nil { + enisMax.Set(float64(maxENIs)) + } + maxIPs, err := c.awsClient.GetENIipLimit() + if err == nil { + ipMax.Set(float64(maxIPs * int64(maxENIs))) + } enis, err := c.awsClient.GetAttachedENIs() if err != nil { log.Error("Failed to retrive ENI info") @@ -240,8 +258,8 @@ func (c *IPAMContext) decreaseIPPool() { log.Debugf("Start freeing eni %s", eni) c.awsClient.FreeENI(eni) total, used := c.dataStore.GetStats() - log.Debugf("Successfully decreased IP Pool: total=%d, used=%d, c.currentMaxAddrsPerENI =%d, c.maxAddrsPerENI = %d", - total, used, c.currentMaxAddrsPerENI, c.maxAddrsPerENI) + log.Debugf("Successfully decreased IP Pool") + logPoolStats(total, used, c.currentMaxAddrsPerENI, c.maxAddrsPerENI) } func isAttachmentLimitExceededError(err error) bool { @@ -260,6 +278,10 @@ func (c *IPAMContext) increaseIPPool() { return } if (c.maxENI > 0) && (c.maxENI == c.dataStore.GetENIs()) { + if c.maxENI < maxENIs { + errString := "desired: " + strconv.FormatInt(int64(maxENIs), 10) + "current: " + strconv.FormatInt(int64(c.maxENI), 10) + ipamdErrInc("unExpectedMaxENIAttached", errors.New(errString)) + } log.Debugf("Skipping increase IPPOOL due to max ENI already attached to the instance : %d", c.maxENI) return } @@ -297,8 +319,8 @@ func (c *IPAMContext) increaseIPPool() { return } total, used := c.dataStore.GetStats() - log.Debugf("Successfully increased IP Pool: total=%d, used=%d, c.currentMaxAddrsPerENI =%d, c.maxAddrsPerENI = %d", - total, used, c.currentMaxAddrsPerENI, c.maxAddrsPerENI) + log.Debugf("Successfully increased IP Pool") + logPoolStats(total, used, c.currentMaxAddrsPerENI, c.maxAddrsPerENI) } // setupENI does following: @@ -400,23 +422,46 @@ func (c *IPAMContext) waitENIAttached(eni string) (awsutils.ENIMetadata, error) } } +func getWarmENITarget() int { + inputStr, found := os.LookupEnv("WARM_ENI_TARGET") + + if !found { + return defaultWarmENITarget + } + + if input, err := strconv.Atoi(inputStr); err == nil { + if input < 0 { + return defaultWarmENITarget + } + log.Debugf("Using WARM-ENI-TARGET %v", input) + return input + } + return defaultWarmENITarget +} + +func logPoolStats(total, used, currentMaxAddrsPerENI, maxAddrsPerENI int) { + log.Debugf("IP pool stats: total = %d, used = %d, c.currentMaxAddrsPerENI = %d, c.maxAddrsPerENI = %d", + total, used, currentMaxAddrsPerENI, maxAddrsPerENI) +} + //nodeIPPoolTooLow returns true if IP pool is below low threshhold func (c *IPAMContext) nodeIPPoolTooLow() bool { + warmENITarget := getWarmENITarget() total, used := c.dataStore.GetStats() - log.Debugf("IP pool stats: total=%d, used=%d, c.currentMaxAddrsPerENI =%d, c.maxAddrsPerENI = %d", - total, used, c.currentMaxAddrsPerENI, c.maxAddrsPerENI) + logPoolStats(total, used, c.currentMaxAddrsPerENI, c.maxAddrsPerENI) - return ((total - used) <= c.currentMaxAddrsPerENI) + available := total - used + return (available <= c.currentMaxAddrsPerENI*warmENITarget) } // NodeIPPoolTooHigh returns true if IP pool is above high threshhold func (c *IPAMContext) nodeIPPoolTooHigh() bool { + warmENITarget := getWarmENITarget() total, used := c.dataStore.GetStats() + logPoolStats(total, used, c.currentMaxAddrsPerENI, c.maxAddrsPerENI) - log.Debugf("IP pool stats: total=%d, used=%d, c.currentMaxAddrsPerENI =%d, c.maxAddrsPerENI = %d", - total, used, c.currentMaxAddrsPerENI, c.maxAddrsPerENI) - - return (total-used > 2*c.currentMaxAddrsPerENI) + available := total - used + return (available > (warmENITarget+1)*c.currentMaxAddrsPerENI) } diff --git a/misc/cni_metrics_helper.yaml b/misc/cni_metrics_helper.yaml index e0c21fd107..05fdcff4ef 100644 --- a/misc/cni_metrics_helper.yaml +++ b/misc/cni_metrics_helper.yaml @@ -77,6 +77,9 @@ spec: spec: serviceAccountName: cni-metrics-helper containers: - - image: 694065802095.dkr.ecr.us-west-2.amazonaws.com/cni-metrics-helper:0.1.0 + - image: 694065802095.dkr.ecr.us-west-2.amazonaws.com/cni-metrics-helper:0.1.1 imagePullPolicy: Always name: cni-metrics-helper + env: + - name: USE_CLOUDWATCH + value: "no" diff --git a/pkg/awsutils/awsutils.go b/pkg/awsutils/awsutils.go index 1cf321b9f6..c54239131d 100644 --- a/pkg/awsutils/awsutils.go +++ b/pkg/awsutils/awsutils.go @@ -73,14 +73,14 @@ var ( awsAPIErr = prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "aws_api_error_count", - Help: "the number of times AWS API returns an err", + Help: "The number of times AWS API returns an error", }, []string{"api", "error"}, ) awsUtilsErr = prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "aws_utils_error_count", - Help: " the number of errors not handled in awsutils library", + Help: "The number of errors not handled in awsutils library", }, []string{"fn", "error"}, )