Skip to content

Commit

Permalink
update branch ENI operation metrics & dev guide (aws#465)
Browse files Browse the repository at this point in the history
  • Loading branch information
sushrk committed Oct 25, 2024
1 parent bbad908 commit f09b02f
Show file tree
Hide file tree
Showing 5 changed files with 40 additions and 23 deletions.
8 changes: 6 additions & 2 deletions DEVELOPER_GUIDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,12 @@ make toolchain # Install required to develop the project

## Testing a code change

Deploy your changes to a local development cluster and run the tests against it. You will need to allowlist your account
for ENI trunking before the deployment.
Deploy your changes to a local development cluster and run the tests against it. You will need to allowlist your account for ENI trunking before the deployment.

If you are testing on an EKS beta cluster, set
```sh
BETA_CLUSTER=true
```

```sh
make apply-dependencies # install the cert manager and certificate
Expand Down
9 changes: 8 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ GOLANG_VERSION ?= $(shell cat .go-version)
BUILD_IMAGE ?= public.ecr.aws/docker/library/golang:$(GOLANG_VERSION)
GOARCH ?= amd64
PLATFORM ?= linux/amd64
USER_ROLE_ARN ?= arn:aws:iam::$(AWS_ACCOUNT):role/VPCResourceControllerRole
BETA_CLUSTER ?= false

help: ## Display help
@awk 'BEGIN {FS = ":.*##"; printf "Usage:\n make \033[36m<target>\033[0m\n"} /^[a-zA-Z_0-9-]+:.*?##/ { printf " \033[36m%-15s\033[0m %s\n", $$1, $$2 } /^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) } ' $(MAKEFILE_LIST)
Expand Down Expand Up @@ -51,14 +53,19 @@ toolchain: ## Install developer toolchain
./hack/toolchain.sh

apply: image check-deployment-env check-env ## Deploy controller to ~/.kube/config
ifeq ($(BETA_CLUSTER), true)
VPC_ID=$(shell aws eks describe-cluster --name ${CLUSTER_NAME} --region ${AWS_REGION} --endpoint https://api.beta.us-west-2.wesley.amazonaws.com --query "cluster.resourcesVpcConfig" --output json | jq '.vpcId')
else
VPC_ID=$(shell aws eks describe-cluster --name ${CLUSTER_NAME} --region ${AWS_REGION} --query "cluster.resourcesVpcConfig" --output json | jq '.vpcId')
endif
eksctl create iamserviceaccount vpc-resource-controller --namespace kube-system --cluster ${CLUSTER_NAME} --region ${AWS_REGION} \
--role-name VPCResourceControllerRole \
--attach-policy-arn=arn:aws:iam::aws:policy/AdministratorAccess \
--override-existing-serviceaccounts \
--approve
kustomize build config/crd | kubectl apply -f -
cd config/controller && kustomize edit set image controller=${IMAGE}
kustomize build config/default | sed "s|CLUSTER_NAME|${CLUSTER_NAME}|g;s|USER_ROLE_ARN|${USER_ROLE_ARN}|g" | kubectl apply -f -
kustomize build config/default | sed "s|CLUSTER_NAME|${CLUSTER_NAME}|g;s|USER_ROLE_ARN|${USER_ROLE_ARN}|g;s|VPC_ID|${VPC_ID}|g" | kubectl apply -f -
kubectl patch rolebinding eks-vpc-resource-controller-rolebinding -n kube-system --patch '{"subjects":[{"kind":"ServiceAccount","name":"vpc-resource-controller","namespace":"kube-system"}]}'
kubectl patch clusterrolebinding vpc-resource-controller-rolebinding --patch '{"subjects":[{"kind":"ServiceAccount","name":"vpc-resource-controller","namespace":"kube-system"}]}'

Expand Down
2 changes: 2 additions & 0 deletions config/controller/controller.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ spec:
- --role-arn=USER_ROLE_ARN
- --leader-elect
- --metrics-bind-address=:8443
- --introspect-bind-addr=:22775
- --vpc-id=VPC_ID
image: controller:latest
name: controller
resources:
Expand Down
42 changes: 23 additions & 19 deletions pkg/provider/branch/provider.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,44 +45,47 @@ import (
"sigs.k8s.io/controller-runtime/pkg/metrics"
)

const (
operationCreateBranchENI = "create_branch_eni"
operationAnnotateBranchENI = "annotate_branch_eni"
operationInitTrunk = "init_trunk"
resourceCountLabel = "resource_count"
operationLabel = "branch_provider_operation"

ReasonSecurityGroupRequested = "SecurityGroupRequested"
ReasonResourceAllocated = "ResourceAllocated"
ReasonBranchAllocationFailed = "BranchAllocationFailed"
ReasonBranchENIAnnotationFailed = "BranchENIAnnotationFailed"

ReasonTrunkENICreationFailed = "TrunkENICreationFailed"
)

var (
branchProviderOperationsErrCount = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "branch_provider_operations_err_count",
Help: "The number of errors encountered for branch provider operations",
},
[]string{"operation"},
[]string{operationLabel},
)

branchProviderOperationLatency = prometheus.NewSummaryVec(
prometheus.SummaryOpts{
Name: "branch_provider_operation_latency",
Help: "Branch Provider operations latency in ms",
Name: "branch_provider_operation_latency",
Help: "Branch Provider operations latency in ms",
Objectives: map[float64]float64{0: 0, 0.5: 0.05, 0.9: 0.01, 0.99: 0.001, 1: 0},
},
[]string{"operation", "resource_count"},
[]string{operationLabel, resourceCountLabel},
)

operationCreateBranchENI = "create_branch_eni"
operationCreateBranchENIAndAnnotate = "create_and_annotate_branch_eni"
operationInitTrunk = "init_trunk"

ReasonSecurityGroupRequested = "SecurityGroupRequested"
ReasonResourceAllocated = "ResourceAllocated"
ReasonBranchAllocationFailed = "BranchAllocationFailed"
ReasonBranchENIAnnotationFailed = "BranchENIAnnotationFailed"

ReasonTrunkENICreationFailed = "TrunkENICreationFailed"

deleteQueueRequeueRequest = ctrl.Result{RequeueAfter: time.Second * 30, Requeue: true}

// NodeDeleteRequeueRequestDelay represents the time after which the resources belonging to a node will be cleaned
// up after receiving the actual node delete event.
NodeDeleteRequeueRequestDelay = time.Minute * 5

prometheusRegistered = false
)

var (
ErrTrunkExistInCache = fmt.Errorf("trunk eni already exist in cache")
ErrTrunkNotInCache = fmt.Errorf("trunk eni not present in cache")
)
Expand Down Expand Up @@ -172,7 +175,7 @@ func (b *branchENIProvider) InitResource(instance ec2.EC2Instance) error {

utils.SendNodeEventWithNodeName(b.apiWrapper.K8sAPI, nodeName, utils.NodeTrunkFailedInitializationReason, "The node failed initializing trunk interface", v1.EventTypeNormal, b.log)
branchProviderOperationsErrCount.WithLabelValues("init").Inc()
return fmt.Errorf("initalizing trunk, %w", err)
return fmt.Errorf("initializing trunk, %w", err)
}
branchProviderOperationLatency.WithLabelValues(operationInitTrunk, "1").Observe(timeSinceMs(start))

Expand Down Expand Up @@ -377,6 +380,7 @@ func (b *branchENIProvider) CreateAndAnnotateResources(podNamespace string, podN
return ctrl.Result{}, err
}

start = time.Now()
// Annotate the pod with the created resources
err = b.apiWrapper.PodAPI.AnnotatePod(pod.Namespace, pod.Name, pod.UID,
config.ResourceNamePodENI, string(jsonBytes))
Expand All @@ -393,7 +397,7 @@ func (b *branchENIProvider) CreateAndAnnotateResources(podNamespace string, podN
b.apiWrapper.K8sAPI.BroadcastEvent(pod, ReasonResourceAllocated,
fmt.Sprintf("Allocated %s to the pod", string(jsonBytes)), v1.EventTypeNormal)

branchProviderOperationLatency.WithLabelValues(operationCreateBranchENIAndAnnotate, strconv.Itoa(resourceCount)).
branchProviderOperationLatency.WithLabelValues(operationAnnotateBranchENI, strconv.Itoa(resourceCount)).
Observe(timeSinceMs(start))

log.Info("created and annotated branch interface/s successfully", "branches", branchENIs)
Expand Down
2 changes: 1 addition & 1 deletion scripts/test/lib/config.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ function add_suffix() {

# IAM Role Name for Linux Node Role where VPC Resource Controller Runs. It should
# have the Trunk Association Policy
TRUNK_ASSOC_POLICY_NAME=$(add_suffix "AssociateTrunkInterfcePolicy")
TRUNK_ASSOC_POLICY_NAME=$(add_suffix "AssociateTrunkInterfacePolicy")
INSTANCE_ROLE_NAME=$(add_suffix "LinuxNodeRole")

# IAM Role and its Policy Names which have the permission to manage Trunk/Branch
Expand Down

0 comments on commit f09b02f

Please sign in to comment.