Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add clusterloader2 runs against immutable container hosts OS on AWS and Azure #503

Open
wants to merge 7 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 0 additions & 7 deletions jobs/competitive-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -97,10 +97,3 @@ jobs:
regions: ${{ parameters.regions }}
engine_input: ${{ parameters.engine_input }}
credential_type: ${{ parameters.credential_type }}
- template: /steps/cleanup-resources.yml
parameters:
cloud: ${{ parameters.cloud }}
regions: ${{ parameters.regions }}
terraform_arguments: ${{ parameters.terraform_arguments }}
retry_attempt_count: ${{ parameters.retry_attempt_count }}
credential_type: ${{ parameters.credential_type }}
14 changes: 11 additions & 3 deletions modules/python/clusterloader2/cri/cri.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,15 @@ def override_config_clusterloader2(
print(f"Node {node.metadata.name} has allocatable cpu of {allocatable_cpu} and allocatable memory of {allocatable_memory}")

cpu_value = int(allocatable_cpu.replace("m", ""))
memory_value = int(allocatable_memory.replace("Ki", ""))
# The Bottlerocket OS SKU on EKS reports the allocatable_memory property in Mi, while AKS and
# Amazon Linux (the default SKUs) use Ki. Handle the Mi case here by converting Mi to Ki.
if "Mi" in allocatable_memory:
memory_value = int(allocatable_memory.replace("Mi", "")) * 1024
elif "Ki" in allocatable_memory:
memory_value = int(allocatable_memory.replace("Ki", ""))
else:
raise Exception("Unexpected format of allocatable memory node property")

print(f"Node {node.metadata.name} has cpu value of {cpu_value} and memory value of {memory_value}")

allocated_cpu, allocated_memory = _get_daemonsets_pods_allocated_resources(client, node.metadata.name)
Expand Down Expand Up @@ -182,13 +190,13 @@ def main():
args = parser.parse_args()

if args.command == "override":
override_config_clusterloader2(args.node_count, args.node_per_step, args.max_pods, args.repeats, args.operation_timeout,
override_config_clusterloader2(args.node_count, args.node_per_step, args.max_pods, args.repeats, args.operation_timeout,
args.load_type, args.scale_enabled, args.pod_startup_latency_threshold,
args.provider, args.cl2_override_file)
elif args.command == "execute":
execute_clusterloader2(args.cl2_image, args.cl2_config_dir, args.cl2_report_dir, args.kubeconfig, args.provider)
elif args.command == "collect":
collect_clusterloader2(args.node_count, args.max_pods, args.repeats, args.load_type,
collect_clusterloader2(args.node_count, args.max_pods, args.repeats, args.load_type,
args.cl2_report_dir, args.cloud_info, args.run_id, args.run_url, args.result_file)

if __name__ == "__main__":
Expand Down
13 changes: 13 additions & 0 deletions modules/terraform/azure/aks-cli/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,12 @@ resource "terraform_data" "aks_cli" {
"--nodepool-name", var.aks_cli_config.default_node_pool.name,
"--node-count", var.aks_cli_config.default_node_pool.node_count,
"--node-vm-size", var.aks_cli_config.default_node_pool.vm_size,
length(var.aks_cli_config.default_node_pool.node_labels) == 0 ? "" : format("%s %s",
"--labels", join(" ", [
for label_name, label_value in var.aks_cli_config.default_node_pool.node_labels :
format("%s=%s", label_name, label_value)
])
),
"--vm-set-type", var.aks_cli_config.default_node_pool.vm_set_type,
local.optional_parameters,
local.subnet_id_parameter,
Expand Down Expand Up @@ -159,6 +165,13 @@ resource "terraform_data" "aks_nodepool_cli" {
"--nodepool-name", each.value.name,
"--node-count", each.value.node_count,
"--node-vm-size", each.value.vm_size,
local.aks_custom_headers_flags,
length(each.value.node_labels) == 0 ? "" : format("%s %s",
"--labels", join(" ", [
for label_name, label_value in each.value.node_labels :
format("%s=%s", label_name, label_value)
])
),
"--vm-set-type", each.value.vm_set_type,
])
}
Expand Down
2 changes: 2 additions & 0 deletions modules/terraform/azure/aks-cli/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -36,13 +36,15 @@ variable "aks_cli_config" {
name = string
node_count = number
vm_size = string
node_labels = optional(map(string), {})
vm_set_type = optional(string, "VirtualMachineScaleSets")
})
extra_node_pool = optional(
list(object({
name = string
node_count = number
vm_size = string
node_labels = optional(map(string), {})
vm_set_type = optional(string, "VirtualMachineScaleSets")
})), [])
optional_parameters = optional(list(object({
Expand Down
4 changes: 4 additions & 0 deletions modules/terraform/azure/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,15 @@ variable "json_input" {
name = string
node_count = number
vm_size = string
node_labels = optional(map(string), {})
vm_set_type = string
}))
aks_cli_user_node_pool = optional(
list(object({
name = string
node_count = number
vm_size = string
node_labels = optional(map(string), {})
vm_set_type = string
}))
)
Expand Down Expand Up @@ -217,13 +219,15 @@ variable "aks_cli_config_list" {
name = string
node_count = number
vm_size = string
node_labels = optional(map(string), {})
vm_set_type = optional(string, "VirtualMachineScaleSets")
})
extra_node_pool = optional(
list(object({
name = string
node_count = number
vm_size = string
node_labels = optional(map(string), {})
vm_set_type = optional(string, "VirtualMachineScaleSets")
})), [])
optional_parameters = optional(list(object({
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,36 +2,30 @@ trigger: none

variables:
SCENARIO_TYPE: perf-eval
SCENARIO_NAME: cri-kbench-cp-bottlerocket
SCENARIO_NAME: cri-clusterloader2-immut-host
SCENARIO_VERSION: main

stages:
- stage: aws_westeurope
- stage: azure_swedencentral
dependsOn: []
jobs:
- template: /jobs/competitive-test.yml
parameters:
cloud: aws
cloud: azure
regions:
- eu-west-1
- swedencentral
engine: clusterloader2
engine_input:
image: "ghcr.io/azure/clusterloader2:v20241016"
topology: cri-kbench-cp
topology: cri-resource-consume
matrix:
n3-p300-memory:
node_count: 3
max_pods: 9
repeats: 1
operation_timeout: 3m
load_type: memory
n3-p300-cpu:
node_count: 3
max_pods: 9
repeats: 1
operation_timeout: 3m
load_type: cpu
max_parallel: 3
timeout_in_minutes: 120
max_parallel: 1
timeout_in_minutes: 240
credential_type: service_connection
ssh_key_enabled: false
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
scenario_type = "perf-eval"
scenario_name = "cri-kbench-cp-bottlerocket"
scenario_name = "cri-clusterloader2-immut-host"
deletion_delay = "2h"
owner = "aks"

Expand Down Expand Up @@ -84,9 +84,9 @@ eks_config_list = [{
name = "userpool0"
ami_type = "BOTTLEROCKET_x86_64"
instance_types = ["m5.4xlarge"]
min_size = 3
max_size = 3
desired_size = 3
min_size = 10
max_size = 10
desired_size = 10
capacity_type = "ON_DEMAND"
taints = [
{
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# Terraform input values for the Azure side of the "cri-clusterloader2-immut-host"
# perf-eval scenario: one virtual network and one AKS cluster with three node pools
# (2 + 1 + 3 = 6 nodes in total).
scenario_type = "perf-eval"
scenario_name = "cri-clusterloader2-immut-host"
deletion_delay = "2h"
owner = "aks"

# Single virtual network with one subnet; no NSG name, public IP associations or NSG rules are set.
network_config_list = [
{
role = "client"
vnet_name = "cri-vnet"
vnet_address_space = "10.0.0.0/9"
subnet = [
{
name = "cri-subnet-1"
address_prefix = "10.0.0.0/16"
}
]
network_security_group_name = ""
nic_public_ip_associations = []
nsr_rules = []
}
]

# Single AKS cluster using the "azure" network plugin in "overlay" mode.
aks_config_list = [
{
role = "client"
aks_name = "cri-resource-consume"
dns_prefix = "cri"
# NOTE(review): this value equals vnet_name ("cri-vnet"), not the subnet name
# "cri-subnet-1" declared above — confirm which one the consuming module expects.
subnet_name = "cri-vnet"
sku_tier = "Standard"
network_profile = {
network_plugin = "azure"
network_plugin_mode = "overlay"
# NOTE(review): pod_cidr is the same range as vnet_address_space above (10.0.0.0/9) —
# confirm that the overlap is intended.
pod_cidr = "10.0.0.0/9"
# dns_service_ip (192.168.0.10) lies inside service_cidr (192.168.0.0/16).
service_cidr = "192.168.0.0/16"
dns_service_ip = "192.168.0.10"
}
# Default pool: 2 AzureLinux nodes with only_critical_addons_enabled set to true.
default_node_pool = {
name = "default"
node_count = 2
vm_size = "Standard_D16_v4"
os_disk_type = "Managed"
os_sku = "AzureLinux"
only_critical_addons_enabled = true
temporary_name_for_rotation = "defaulttmp"
}
extra_node_pool = [
# Single-node pool labeled prometheus=true; autoscaling disabled.
{
name = "prompool"
node_count = 1
auto_scaling_enabled = false
vm_size = "Standard_D16_v4"
os_sku = "AzureLinux"
node_labels = { "prometheus" = "true" }
},
# Three-node pool carrying both the cri-resource-consume=true label and a matching
# NoSchedule taint; autoscaling disabled.
{
name = "userpool0"
node_count = 3
auto_scaling_enabled = false
vm_size = "Standard_D16_v4"
os_sku = "AzureLinux"
node_taints = ["cri-resource-consume=true:NoSchedule"]
node_labels = { "cri-resource-consume" = "true" }
}
]
kubernetes_version = "1.31"
}
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{
"run_id" : "123456789",
"region" : "eastus"
}
17 changes: 0 additions & 17 deletions steps/topology/cri-kbench-cp/collect-clusterloader2.yml

This file was deleted.

17 changes: 0 additions & 17 deletions steps/topology/cri-kbench-cp/execute-clusterloader2.yml

This file was deleted.

16 changes: 0 additions & 16 deletions steps/topology/cri-kbench-cp/validate-resources.yml

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,4 @@ steps:
region: ${{ parameters.regions[0] }}
- template: /steps/engine/clusterloader2/slo/validate.yml
parameters:
desired_nodes: 14
desired_nodes: 6