
feat: add managing machine pools to ROSA deployment
matihost committed Nov 22, 2024
1 parent 0564651 commit 1f4ecf6
Showing 9 changed files with 161 additions and 43 deletions.
2 changes: 1 addition & 1 deletion ansible/system/inventory.yml
@@ -28,7 +28,7 @@ all:
# MQ Version available at: https://public.dhe.ibm.com/ibmdl/export/pub/software/websphere/messaging/mqdev/redist/
mq: 9.4.1.0
# List of supported Ubuntu versions: https://download.opensuse.org/repositories/home:/npreining:/debian-ubuntu-onedrive
onedrive_ubuntu_version: "24.04"
onedrive_ubuntu_version: "24.10"
# List of supported Ubuntu versions: https://ftp.postgresql.org/pub/pgadmin/pgadmin4/apt/
pgadmin_ubuntu_version: "noble"
# List of supported Ubuntu versions: https://apt.envoyproxy.io/
10 changes: 9 additions & 1 deletion terraform/aws/aws-rosa/Makefile
@@ -12,7 +12,7 @@ MODE := apply
ifeq ($(strip $(MODE)),apply)
MODE_STR := apply -auto-approve
else ifeq ($(strip $(MODE)),destroy)
MODE_STR := destroy -auto-approve
$(error Mode destroy is not supported via task run, to destroy cluster execute explicitly make destroy CLUSTER_NAME=... ENV=... )
else
MODE_STR := plan
endif
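
With destroy rejected here, `make run` either applies (the default `MODE`) or plans (any other `MODE` value). A dry-run sketch (the PASS value is illustrative):

```bash
# plan only; nothing is applied
make run ENV=dev MODE=plan PASS=cluster-admin-password
```
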
@@ -30,6 +30,14 @@ ifndef PASS
endif
cd stage/$(ENV) && { export CLUSTER_ADMIN_PASS=$(PASS); terragrunt validate && terragrunt --terragrunt-non-interactive $(MODE_STR) $(TG_FLAGS); }

destroy: # destroy cluster, usage: make destroy CLUSTER_NAME=dev-us-east-1 [ENV=dev]
ifndef CLUSTER_NAME
$(error Env CLUSTER_NAME is not defined. Cluster name is required along with the ENV variable)
endif
rosa delete cluster -c $(CLUSTER_NAME) -w -y
cd stage/$(ENV) && { terragrunt state rm rhcs_hcp_machine_pool.machine_pool; terragrunt --terragrunt-non-interactive destroy -auto-approve $(TG_FLAGS); }
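
The `terragrunt state rm` step matters: `rosa delete cluster` already removes the machine pools, so dropping them from Terraform state prevents the subsequent destroy from failing on resources that no longer exist. A manual walkthrough of what this target runs, assuming `CLUSTER_NAME=dev-us-east-1` and `ENV=dev`:

```bash
rosa delete cluster -c dev-us-east-1 -w -y                  # -w waits for deletion to complete
cd stage/dev
terragrunt state rm rhcs_hcp_machine_pool.machine_pool      # pools were already deleted by rosa
terragrunt --terragrunt-non-interactive destroy -auto-approve
```
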


show-state: ## show state
cd stage/$(ENV) && terragrunt state list && terragrunt show

9 changes: 5 additions & 4 deletions terraform/aws/aws-rosa/README.md
@@ -12,6 +12,8 @@ In particular it creates:

- Latest Terraform or OpenTofu, Terragrunt installed

- jq tool installed

- ROSA CLI installed:

```bash
@@ -88,10 +90,10 @@ at the aws marketplace
make run ENV=dev MODE=apply PASS=cluster-admin-password

# show Terraform state
make show-state
make show-state ENV=dev

# terminates all AWS resource created with apply task
make run ENV
# destroys ROSA HCP along with all AWS resources created with apply task
make destroy CLUSTER_NAME=dev-us-east-1 ENV=dev
```
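
If a deletion appears stuck, the uninstall progress can be followed with the ROSA CLI (a sketch, assuming the dev cluster name; check `rosa logs uninstall --help` for the exact flags in your CLI version):

```bash
# stream the uninstall logs until deletion finishes
rosa logs uninstall -c dev-us-east-1 --watch
```
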

## Day 2 Operations
@@ -114,7 +116,6 @@ k9s

### Login to Web Console


Expose PROXY through bastion:

```bash
6 changes: 3 additions & 3 deletions terraform/aws/aws-rosa/module/account_roles.tf
@@ -3,22 +3,22 @@ locals {
{
role_name = "HCP-ROSA-Installer"
role_type = "installer"
policy_details = "arn:aws:iam::aws:policy/service-role/ROSAInstallerPolicy"
policy_details = data.rhcs_hcp_policies.all_policies.account_role_policies["sts_hcp_installer_permission_policy"]
principal_type = "AWS"
principal_identifier = "arn:${data.aws_partition.current.partition}:iam::${data.rhcs_info.current.ocm_aws_account_id}:role/RH-Managed-OpenShift-Installer"
},
{
role_name = "HCP-ROSA-Support"
role_type = "support"
policy_details = "arn:aws:iam::aws:policy/service-role/ROSASRESupportPolicy"
policy_details = data.rhcs_hcp_policies.all_policies.account_role_policies["sts_hcp_support_permission_policy"]
principal_type = "AWS"
// This is a SRE RH Support role which is used to assume this support role
principal_identifier = data.rhcs_hcp_policies.all_policies.account_role_policies["sts_support_rh_sre_role"]
},
{
role_name = "HCP-ROSA-Worker"
role_type = "instance_worker"
policy_details = "arn:aws:iam::aws:policy/service-role/ROSAWorkerInstancePolicy"
policy_details = data.rhcs_hcp_policies.all_policies.account_role_policies["sts_hcp_instance_worker_permission_policy"]
principal_type = "Service"
principal_identifier = "ec2.amazonaws.com"
},
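Because the permission policies now come from the `rhcs_hcp_policies` data source instead of hard-coded managed-policy ARNs, the generated roles can be sanity-checked after apply. A sketch, assuming a logged-in ROSA CLI and the `dev-us-east-1` prefix:

```bash
# list the ROSA account roles known to OCM
rosa list account-roles
# inspect the policies attached to the installer role (role name derives from the cluster prefix)
aws iam list-attached-role-policies --role-name dev-us-east-1-HCP-ROSA-Installer-Role
```
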
19 changes: 19 additions & 0 deletions terraform/aws/aws-rosa/module/delete-machine-pools.sh
@@ -0,0 +1,19 @@
#!/usr/bin/env bash

CLUSTER_ID="${1:?CLUSTER_ID is required}"
MACHINE_POOL_PREFIX="${2:?MACHINE_POOL_PREFIX is required}"

set -e
# set -x

# shellcheck disable=SC2034
DIRNAME="$(dirname "$0")"

MACHINE_POOLS="$(rosa list machinepools -c "${CLUSTER_ID}" -o json | jq -r '.[].id')"

for pool in $MACHINE_POOLS; do
# Delete only the pools whose names start with MACHINE_POOL_PREFIX
if [[ "${pool}" == ${MACHINE_POOL_PREFIX}* ]]; then
rosa delete machinepool -c "${CLUSTER_ID}" "${pool}" -y
fi
done
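
A usage sketch for the helper; the Terraform module invokes it with the real cluster ID and the `workers-` prefix, while the cluster name below is illustrative:

```bash
# resolve the cluster ID from the cluster name, then drop the default pools
CLUSTER_ID="$(rosa describe cluster -c dev-us-east-1 -o json | jq -r '.id')"
./delete-machine-pools.sh "${CLUSTER_ID}" 'workers-'
```
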
117 changes: 89 additions & 28 deletions terraform/aws/aws-rosa/module/rosa.tf
@@ -1,15 +1,7 @@
locals {
private_subnet_ids = [for subnet in data.aws_subnet.private : subnet.id]
installer_role_arn = "arn:aws:iam::${data.aws_caller_identity.current.account_id}:role/${local.prefix}-HCP-ROSA-Installer-Role"
sts_roles = {
role_arn = local.installer_role_arn,
support_role_arn = "arn:aws:iam::${data.aws_caller_identity.current.account_id}:role/${local.prefix}-HCP-ROSA-Support-Role",
instance_iam_roles = {
worker_role_arn = "arn:aws:iam::${data.aws_caller_identity.current.account_id}:role/${local.prefix}-HCP-ROSA-Worker-Role"
},
operator_role_prefix = local.prefix,
oidc_config_id = rhcs_rosa_oidc_config.oidc_config.id
}
installer_role_arn = aws_iam_role.account_role[0].arn

cluster_admin_credentials = var.cluster_admin_password == "" ? null : { username = "cluster-admin", password = var.cluster_admin_password }
}

@@ -37,7 +29,13 @@ data "aws_subnet" "private" {
}
}


# Warning:
# ROSA does not support removal via Terraform.
# An attempt to remove the cluster via Terraform ignores depends_on and deletes the account_roles first,
# causing the cluster deletion to hang forever.
# Attempting instead to delete the cluster by destroying the cluster and machine pool objects leads to an error:
# it is not possible to delete all machine pools (there must be at least 2 replicas at all times).
# The only way to delete the cluster and all its resources is to remove it via the rosa CLI and then continue with terraform destroy.
resource "rhcs_cluster_rosa_hcp" "rosa_hcp_cluster" {
name = local.prefix
domain_prefix = local.prefix
@@ -53,10 +51,18 @@ resource "rhcs_cluster_rosa_hcp" "rosa_hcp_cluster" {
cloud_region = var.region
aws_account_id = local.account_id
aws_billing_account_id = var.billing_account_id == "" ? local.account_id : var.billing_account_id
sts = local.sts_roles
sts = {
role_arn = local.installer_role_arn,
support_role_arn = aws_iam_role.account_role[1].arn
instance_iam_roles = {
worker_role_arn = aws_iam_role.account_role[2].arn
},
operator_role_prefix = local.prefix,
oidc_config_id = rhcs_rosa_oidc_config.oidc_config.id
}

availability_zones = var.zones
replicas = var.replicas
replicas = length(var.zones)
aws_subnet_ids = local.private_subnet_ids
compute_machine_type = var.machine_instance_type
create_admin_user = var.cluster_admin_password != ""
@@ -86,24 +92,26 @@ resource "rhcs_cluster_rosa_hcp" "rosa_hcp_cluster" {
]

lifecycle {
ignore_changes = [create_admin_user, admin_credentials, replicas]
ignore_changes = [create_admin_user, admin_credentials, replicas, compute_machine_type]
}
}


# TODO add autoscaler
resource "rhcs_hcp_cluster_autoscaler" "cluster_autoscaler" {
count = var.enable_cluster_autoscaler ? 1 : 0
# As of terraform-redhat/rhcs v1.6.6
# this resource is not yet implemented; enabling it ends with an error
# resource "rhcs_hcp_cluster_autoscaler" "cluster_autoscaler" {
# count = var.enable_cluster_autoscaler ? 1 : 0

cluster = rhcs_cluster_rosa_hcp.rosa_hcp_cluster.id
max_pod_grace_period = var.autoscaler_max_pod_grace_period
pod_priority_threshold = var.autoscaler_pod_priority_threshold
max_node_provision_time = var.autoscaler_max_node_provision_time
# cluster = rhcs_cluster_rosa_hcp.rosa_hcp_cluster.id
# max_pod_grace_period = var.autoscaler_max_pod_grace_period
# pod_priority_threshold = var.autoscaler_pod_priority_threshold
# max_node_provision_time = var.autoscaler_max_node_provision_time

resource_limits = {
max_nodes_total = var.autoscaler_max_nodes_total
}
}
# resource_limits = {
# max_nodes_total = var.autoscaler_max_nodes_total
# }
# }

resource "rhcs_hcp_default_ingress" "default_ingress" {
cluster = rhcs_cluster_rosa_hcp.rosa_hcp_cluster.id
Expand All @@ -116,14 +124,67 @@ resource "rhcs_kubeletconfig" "kubeletconfig" {
pod_pids_limit = var.pod_limit
}

# Default machine pools (aka workers-0, workers-1, ...) are not editable after initial cluster creation.
# The solution is to add additional machine pools and remove the default ones.
resource "rhcs_hcp_machine_pool" "machine_pool" {
for_each = var.zones

cluster = rhcs_cluster_rosa_hcp.rosa_hcp_cluster.id
# name is max 15 characters
name = "compute-${trimprefix(each.key, var.region)}"

# it is valid to have a machine pool with 0 nodes, as long as at least 2 replicas in total remain in the cluster
replicas = var.enable_cluster_autoscaler ? null : var.replicas_per_zone
autoscaling = {
enabled = var.enable_cluster_autoscaler
# must be greater than zero.
min_replicas = var.enable_cluster_autoscaler ? 1 : null
max_replicas = var.enable_cluster_autoscaler ? var.autoscaler_max_nodes_per_zone : null

}
labels = var.labels
taints = var.taints
subnet_id = data.aws_subnet.private[each.key].id
aws_node_pool = {
instance_type = var.machine_instance_type
tags = var.aws_tags
}
auto_repair = true
version = var.openshift_version
upgrade_acknowledgements_for = var.max_upgrade_version
# tuning_configs = ...
# kubelet_configs = ...
ignore_deletion_error = false

lifecycle {
ignore_changes = [
cluster,
name,
]
}
}


resource "null_resource" "clean_default_machines" {
triggers = {
always_run = timestamp()
}
provisioner "local-exec" {
command = "${path.module}/delete-machine-pools.sh '${rhcs_cluster_rosa_hcp.rosa_hcp_cluster.id}' 'workers-'"
}

depends_on = [
rhcs_cluster_rosa_hcp.rosa_hcp_cluster,
rhcs_hcp_machine_pool.machine_pool
]
}
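
After an apply, the swap can be verified: with the naming scheme above, a cluster in zones `us-east-1a/b/c` should list pools `compute-a`, `compute-b`, and `compute-c` and no remaining `workers-*` pools. A sketch, assuming the dev cluster name:

```bash
# the zonal pools created by Terraform should be the only ones left
rosa list machinepools -c dev-us-east-1
```
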


# TODO add own IDP based on Keycloak
# example with Entra ID: https://docs.redhat.com/en/documentation/red_hat_openshift_service_on_aws/4/html-single/tutorials/index#cloud-experts-entra-id-idp-register-application
# https://registry.terraform.io/providers/terraform-redhat/rhcs/latest/docs/resources/identity_provider#nested-schema-for-openid
# https://github.com/terraform-redhat/terraform-rhcs-rosa-hcp/blob/main/modules/idp/main.tf#L127

# TODO add own worker pool and delete default one
# https://registry.terraform.io/providers/terraform-redhat/rhcs/latest/docs/guides/worker-machine-pool
# https://github.com/terraform-redhat/terraform-rhcs-rosa-hcp/blob/main/modules/machine-pool/main.tf

# TODO add custom ingress
# https://access.redhat.com/articles/7028653
# https://aws.amazon.com/blogs/containers/implementing-custom-domain-names-with-rosa/
35 changes: 31 additions & 4 deletions terraform/aws/aws-rosa/module/variables.tf
Original file line number Diff line number Diff line change
@@ -25,7 +25,7 @@ variable "zone" {
description = "Preffered AWS AZ where resources need to placed, has to be compatible with region variable"
}

# tflint-ignore: terraform_unused_declarations

variable "region" {
default = "us-east-1"
type = string
@@ -77,16 +77,16 @@ variable "billing_account_id" {
}


variable "replicas" {
variable "replicas_per_zone" {
type = number
default = 3
default = 1
description = "Number of worker nodes to provision. This attribute is applicable solely when autoscaling is disabled. Single zone clusters need at least 2 nodes, multizone clusters need at least 3 nodes. Hosted clusters require that the number of worker nodes be a multiple of the number of private subnets. (default: 2)"
}

variable "machine_instance_type" {
type = string
description = "Identifies the Instance type used by the default worker machine pool e.g. `m5.xlarge`"
default = null
default = "m5.xlarge"
}


@@ -124,6 +124,22 @@ variable "pod_limit" {
}


variable "taints" {
description = "Taints for a machine pool. This list will overwrite any modifications made to node taints on an ongoing basis."
type = list(object({
key = string
value = string
schedule_type = string
}))
default = null
}

variable "labels" {
description = "Labels for the machine pool. Format should be a comma-separated list of 'key = value'. This list will overwrite any modifications made to node labels on an ongoing basis."
type = map(string)
default = null
}

# TODO enable when feature works
# As of v1.6.6 ends with error: Autoscaler configuration is not available
variable "enable_cluster_autoscaler" {
@@ -132,26 +148,37 @@ variable "enable_cluster_autoscaler" {
description = "Whether to create enable cluster autoscaler"
}

# tflint-ignore: terraform_unused_declarations
variable "autoscaler_max_pod_grace_period" {
type = number
default = 30
description = "Gives pods graceful termination time before scaling down."
}

# tflint-ignore: terraform_unused_declarations
variable "autoscaler_pod_priority_threshold" {
type = number
default = 1
description = "To allow users to schedule 'best-effort' pods, which shouldn't trigger Cluster Autoscaler actions, but only run when there are spare resources available."
}

# tflint-ignore: terraform_unused_declarations
variable "autoscaler_max_node_provision_time" {
type = string
default = "10m"
description = "Maximum time cluster-autoscaler waits for node to be provisioned."
}

# tflint-ignore: terraform_unused_declarations
variable "autoscaler_max_nodes_total" {
type = number
default = 10
description = "Maximum number of nodes in all node groups. Cluster autoscaler will not grow the cluster beyond this number."
}

# tflint-ignore: terraform_unused_declarations
variable "autoscaler_max_nodes_per_zone" {
type = number
default = 3
description = "Maximum number of nodes per machine poll (zonal)"
}
4 changes: 2 additions & 2 deletions terraform/aws/aws-rosa/module/versions.tf
@@ -9,8 +9,8 @@ terraform {
version = "~> 1"

}
random = {
source = "hashicorp/random"
null = {
source = "hashicorp/null"
version = "~> 3"
}
}
2 changes: 2 additions & 0 deletions terraform/aws/aws-rosa/stage/dev/terragrunt.hcl
@@ -18,4 +18,6 @@ inputs = {
Region = "us-east1"
}
zones = ["us-east-1a", "us-east-1b", "us-east-1c"]
# machine_instance_type = "c5.2xlarge" # 8 cores, 16 GiB RAM
enable_cluster_autoscaler = true
}
