diff --git a/ansible/system/inventory.yml b/ansible/system/inventory.yml
index 2dc2d24..b577189 100644
--- a/ansible/system/inventory.yml
+++ b/ansible/system/inventory.yml
@@ -28,7 +28,7 @@ all:
     # MQ Version available at: https://public.dhe.ibm.com/ibmdl/export/pub/software/websphere/messaging/mqdev/redist/
     mq: 9.4.1.0
     # List of supported Ubuntu versions: https://download.opensuse.org/repositories/home:/npreining:/debian-ubuntu-onedrive
-    onedrive_ubuntu_version: "24.04"
+    onedrive_ubuntu_version: "24.10"
     # List of supported Ubuntu versions: https://ftp.postgresql.org/pub/pgadmin/pgadmin4/apt/
     pgadmin_ubuntu_version: "noble"
     # List of supported Ubuntu versions: https://apt.envoyproxy.io/
diff --git a/terraform/aws/aws-rosa/Makefile b/terraform/aws/aws-rosa/Makefile
index 961b8e4..2fdc618 100644
--- a/terraform/aws/aws-rosa/Makefile
+++ b/terraform/aws/aws-rosa/Makefile
@@ -12,7 +12,7 @@ MODE := apply
 ifeq ($(strip $(MODE)),apply)
 	MODE_STR := apply -auto-approve
 else ifeq ($(strip $(MODE)),destroy)
-	MODE_STR := destroy -auto-approve
+	$(error Mode destroy is not supported via the run task. To destroy a cluster, run explicitly: make destroy CLUSTER_NAME=... ENV=...)
 else
 	MODE_STR := plan
 endif
@@ -30,6 +30,14 @@ ifndef PASS
 endif
 	cd stage/$(ENV) && { export CLUSTER_ADMIN_PASS=$(PASS); terragrunt validate && terragrunt --terragrunt-non-interactive $(MODE_STR) $(TG_FLAGS); }
 
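+# The destroy flow mirrors the warning in module/rosa.tf: the cluster has to be removed with the rosa CLI
+# (which also removes its machine pools), the now-deleted machine pool entries are dropped from the Terraform
+# state, and only then can terragrunt destroy clean up the remaining AWS resources.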
+destroy: ## destroy cluster, usage: make destroy CLUSTER_NAME=dev-us-east-1 [ENV=dev]
+ifndef CLUSTER_NAME
+	$(error Env CLUSTER_NAME is not defined. The cluster name is required along with the ENV variable)
+endif
+	rosa delete cluster -c $(CLUSTER_NAME) -w -y
+	cd stage/$(ENV) && { terragrunt state rm rhcs_hcp_machine_pool.machine_pool; terragrunt --terragrunt-non-interactive destroy -auto-approve $(TG_FLAGS); }
+
+
 show-state: ## show state
 	cd stage/$(ENV) && terragrunt state list && terragrunt show
diff --git a/terraform/aws/aws-rosa/README.md b/terraform/aws/aws-rosa/README.md
index 9cc46b1..2ad20a3 100644
--- a/terraform/aws/aws-rosa/README.md
+++ b/terraform/aws/aws-rosa/README.md
@@ -12,6 +12,8 @@ In particular it creates:
 
 - Latest Terraform or OpenTofu, Terragrunt installed
 
+- jq tool installed
+
 - ROSA CLI installed:
 
 ```bash
@@ -88,10 +90,10 @@ at the aws marketplace
 make run ENV=dev MODE=apply PASS=cluster-admin-password
 
 # show Terraform state
-make show-state
+make show-state ENV=dev
 
-# terminates all AWS resource created with apply task
-make run ENV
+# destroys the ROSA HCP cluster along with all AWS resources created by the apply task
+make destroy CLUSTER_NAME=dev-us-east-1 ENV=dev
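+
+# optional sanity check: confirm the cluster and the Terraform state are gone
+rosa list clusters
+make show-state ENV=dev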
 ```
 
 ## Day 2 Operations
@@ -114,7 +116,6 @@ k9s
 ```
 
 ### Login to Web Console
-
 Expose PROXY through bastion:
 
 ```bash
diff --git a/terraform/aws/aws-rosa/module/account_roles.tf b/terraform/aws/aws-rosa/module/account_roles.tf
index 3ed260c..6c1c76a 100644
--- a/terraform/aws/aws-rosa/module/account_roles.tf
+++ b/terraform/aws/aws-rosa/module/account_roles.tf
@@ -3,14 +3,14 @@ locals {
     {
       role_name            = "HCP-ROSA-Installer"
      role_type            = "installer"
-      policy_details       = "arn:aws:iam::aws:policy/service-role/ROSAInstallerPolicy"
+      policy_details       = data.rhcs_hcp_policies.all_policies.account_role_policies["sts_hcp_installer_permission_policy"]
       principal_type       = "AWS"
       principal_identifier = "arn:${data.aws_partition.current.partition}:iam::${data.rhcs_info.current.ocm_aws_account_id}:role/RH-Managed-OpenShift-Installer"
     },
     {
       role_name            = "HCP-ROSA-Support"
       role_type            = "support"
-      policy_details       = "arn:aws:iam::aws:policy/service-role/ROSASRESupportPolicy"
+      policy_details       = data.rhcs_hcp_policies.all_policies.account_role_policies["sts_hcp_support_permission_policy"]
       principal_type       = "AWS"
       // This is a SRE RH Support role which is used to assume this support role
       principal_identifier = data.rhcs_hcp_policies.all_policies.account_role_policies["sts_support_rh_sre_role"]
@@ -18,7 +18,7 @@ locals {
     {
       role_name            = "HCP-ROSA-Worker"
       role_type            = "instance_worker"
-      policy_details       = "arn:aws:iam::aws:policy/service-role/ROSAWorkerInstancePolicy"
+      policy_details       = data.rhcs_hcp_policies.all_policies.account_role_policies["sts_hcp_instance_worker_permission_policy"]
       principal_type       = "Service"
       principal_identifier = "ec2.amazonaws.com"
     },
diff --git a/terraform/aws/aws-rosa/module/delete-machine-pools.sh b/terraform/aws/aws-rosa/module/delete-machine-pools.sh
new file mode 100755
index 0000000..d6fb6bf
--- /dev/null
+++ b/terraform/aws/aws-rosa/module/delete-machine-pools.sh
@@ -0,0 +1,19 @@
+#!/usr/bin/env bash
+
+CLUSTER_ID="${1:?CLUSTER_ID is required}"
+MACHINE_POOL_PREFIX="${2:?MACHINE_POOL_PREFIX is required}"
+
+set -e
+# set -x
+
+# shellcheck disable=SC2034
+DIRNAME="$(dirname "$0")"
+
+MACHINE_POOLS="$(rosa list machinepools -c "${CLUSTER_ID}" -o json | jq -r '.[].id')"
+
+for pool in $MACHINE_POOLS; do
+  # Only delete pools whose name starts with the given prefix (e.g. the default "workers-" pools)
+  if [[ "${pool}" == ${MACHINE_POOL_PREFIX}* ]]; then
+    rosa delete machinepool -c "${CLUSTER_ID}" "${pool}" -y
+  fi
+done
diff --git a/terraform/aws/aws-rosa/module/rosa.tf b/terraform/aws/aws-rosa/module/rosa.tf
index a122a66..18fe6e3 100644
--- a/terraform/aws/aws-rosa/module/rosa.tf
+++ b/terraform/aws/aws-rosa/module/rosa.tf
@@ -1,15 +1,7 @@
 locals {
   private_subnet_ids = [for subnet in data.aws_subnet.private : subnet.id]
-  installer_role_arn = "arn:aws:iam::${data.aws_caller_identity.current.account_id}:role/${local.prefix}-HCP-ROSA-Installer-Role"
-  sts_roles = {
-    role_arn             = local.installer_role_arn,
-    support_role_arn     = "arn:aws:iam::${data.aws_caller_identity.current.account_id}:role/${local.prefix}-HCP-ROSA-Support-Role",
-    instance_iam_roles = {
-      worker_role_arn = "arn:aws:iam::${data.aws_caller_identity.current.account_id}:role/${local.prefix}-HCP-ROSA-Worker-Role"
-    },
-    operator_role_prefix = local.prefix,
-    oidc_config_id       = rhcs_rosa_oidc_config.oidc_config.id
-  }
+  installer_role_arn = aws_iam_role.account_role[0].arn
+
   cluster_admin_credentials = var.cluster_admin_password == "" ? null : { username = "cluster-admin", password = var.cluster_admin_password }
 }
@@ -37,7 +29,13 @@ data "aws_subnet" "private" {
   }
 }
 
-
+# Warning:
+# ROSA does not support cluster removal via Terraform.
+# An attempt to remove the cluster via terraform destroy ignores depends_on and deletes the account_roles first,
+# which causes the cluster deletion to hang forever.
+# Deleting the cluster by removing the cluster and machine pool objects also fails, because it is not possible
+# to delete all machine pools (there must be at least 2 replicas at all times).
+# The only way to delete the cluster and all its resources is to remove it via the rosa CLI and then continue with tf destroy.
 resource "rhcs_cluster_rosa_hcp" "rosa_hcp_cluster" {
   name          = local.prefix
   domain_prefix = local.prefix
@@ -53,10 +51,18 @@ resource "rhcs_cluster_rosa_hcp" "rosa_hcp_cluster" {
   cloud_region           = var.region
   aws_account_id         = local.account_id
   aws_billing_account_id = var.billing_account_id == "" ? local.account_id : var.billing_account_id
-  sts = local.sts_roles
+  sts = {
+    role_arn         = local.installer_role_arn,
+    support_role_arn = aws_iam_role.account_role[1].arn
+    instance_iam_roles = {
+      worker_role_arn = aws_iam_role.account_role[2].arn
+    },
+    operator_role_prefix = local.prefix,
+    oidc_config_id       = rhcs_rosa_oidc_config.oidc_config.id
+  }
 
   availability_zones = var.zones
-  replicas           = var.replicas
+  replicas           = length(var.zones)
   aws_subnet_ids       = local.private_subnet_ids
   compute_machine_type = var.machine_instance_type
   create_admin_user    = var.cluster_admin_password != ""
@@ -86,24 +92,26 @@ resource "rhcs_cluster_rosa_hcp" "rosa_hcp_cluster" {
   ]
 
   lifecycle {
-    ignore_changes = [create_admin_user, admin_credentials, replicas]
+    ignore_changes = [create_admin_user, admin_credentials, replicas, compute_machine_type]
   }
 }
 
 # TODO add autoscaller
-resource "rhcs_hcp_cluster_autoscaler" "cluster_autoscaler" {
-  count = var.enable_cluster_autoscaler ? 1 : 0
+# As of terraform-redhat/rhcs v1.6.6
+# this resource is not implemented yet; enabling it ends with an error.
+# resource "rhcs_hcp_cluster_autoscaler" "cluster_autoscaler" {
+#   count = var.enable_cluster_autoscaler ? 1 : 0
 
-  cluster                 = rhcs_cluster_rosa_hcp.rosa_hcp_cluster.id
-  max_pod_grace_period    = var.autoscaler_max_pod_grace_period
-  pod_priority_threshold  = var.autoscaler_pod_priority_threshold
-  max_node_provision_time = var.autoscaler_max_node_provision_time
+#   cluster                 = rhcs_cluster_rosa_hcp.rosa_hcp_cluster.id
+#   max_pod_grace_period    = var.autoscaler_max_pod_grace_period
+#   pod_priority_threshold  = var.autoscaler_pod_priority_threshold
+#   max_node_provision_time = var.autoscaler_max_node_provision_time
 
-  resource_limits = {
-    max_nodes_total = var.autoscaler_max_nodes_total
-  }
-}
+#   resource_limits = {
+#     max_nodes_total = var.autoscaler_max_nodes_total
+#   }
+# }
 
 resource "rhcs_hcp_default_ingress" "default_ingress" {
   cluster = rhcs_cluster_rosa_hcp.rosa_hcp_cluster.id
@@ -116,14 +124,67 @@ resource "rhcs_kubeletconfig" "kubeletconfig" {
   pod_pids_limit = var.pod_limit
 }
 
+# Default machine pools (aka workers-0, workers-1, ...) are not editable after the initial cluster creation.
+# The workaround is to add additional machine pools and remove the default ones.
+resource "rhcs_hcp_machine_pool" "machine_pool" {
+  for_each = var.zones
+
+  cluster = rhcs_cluster_rosa_hcp.rosa_hcp_cluster.id
+  # name is max 15 characters
+  name = "compute-${trimprefix(each.key, var.region)}"
+
+  # it is valid to have a pool with 0 replicas, as long as at least 2 replicas remain in the cluster in total
+  replicas = var.enable_cluster_autoscaler ? null : var.replicas_per_zone
+  autoscaling = {
+    enabled = var.enable_cluster_autoscaler
+    # must be greater than zero
+    min_replicas = var.enable_cluster_autoscaler ? 1 : null
+    max_replicas = var.enable_cluster_autoscaler ? var.autoscaler_max_nodes_per_zone : null
+
+  }
+  labels    = var.labels
+  taints    = var.taints
+  subnet_id = data.aws_subnet.private[each.key].id
+  aws_node_pool = {
+    instance_type = var.machine_instance_type
+    tags          = var.aws_tags
+  }
+  auto_repair                  = true
+  version                      = var.openshift_version
+  upgrade_acknowledgements_for = var.max_upgrade_version
+  # tuning_configs = ...
+  # kubelet_configs = ...
+  ignore_deletion_error = false
+
+  lifecycle {
+    ignore_changes = [
+      cluster,
+      name,
+    ]
+  }
+}
+
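+# Note: the timestamp() trigger below makes the local-exec provisioner run on every apply. That should be
+# harmless, since delete-machine-pools.sh only removes pools whose name starts with "workers-", which no longer
+# exist after the first run. It does require the rosa CLI and jq on the machine running Terraform (see README).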
+resource "null_resource" "clean_default_machines" {
+  triggers = {
+    always_run = timestamp()
+  }
+  provisioner "local-exec" {
+    command = "${path.module}/delete-machine-pools.sh '${rhcs_cluster_rosa_hcp.rosa_hcp_cluster.id}' 'workers-'"
+  }
+
+  depends_on = [
+    rhcs_cluster_rosa_hcp.rosa_hcp_cluster,
+    rhcs_hcp_machine_pool.machine_pool
+  ]
+}
+
+
 # TODO add own IDP based on Keycloak
+# example with EntraID: https://docs.redhat.com/en/documentation/red_hat_openshift_service_on_aws/4/html-single/tutorials/index#cloud-experts-entra-id-idp-register-application
 # https://registry.terraform.io/providers/terraform-redhat/rhcs/latest/docs/resources/identity_provider#nested-schema-for-openid
 # https://github.com/terraform-redhat/terraform-rhcs-rosa-hcp/blob/main/modules/idp/main.tf#L127
 
-# TODO add own worker pool and delete default one
-# https://registry.terraform.io/providers/terraform-redhat/rhcs/latest/docs/guides/worker-machine-pool
-# https://github.com/terraform-redhat/terraform-rhcs-rosa-hcp/blob/main/modules/machine-pool/main.tf
-
 # TODO add custom ingress
 # https://access.redhat.com/articles/7028653
 # https://aws.amazon.com/blogs/containers/implementing-custom-domain-names-with-rosa/
diff --git a/terraform/aws/aws-rosa/module/variables.tf b/terraform/aws/aws-rosa/module/variables.tf
index b2508cf..fca0260 100644
--- a/terraform/aws/aws-rosa/module/variables.tf
+++ b/terraform/aws/aws-rosa/module/variables.tf
@@ -25,7 +25,7 @@ variable "zone" {
   description = "Preffered AWS AZ where resources need to placed, has to be compatible with region variable"
 }
 
-# tflint-ignore: terraform_unused_declarations
+
 variable "region" {
   default     = "us-east-1"
   type        = string
@@ -77,16 +77,16 @@ variable "billing_account_id" {
 }
 
 
-variable "replicas" {
+variable "replicas_per_zone" {
   type        = number
-  default     = 3
+  default     = 1
   description = "Number of worker nodes to provision. This attribute is applicable solely when autoscaling is disabled. Single zone clusters need at least 2 nodes, multizone clusters need at least 3 nodes. Hosted clusters require that the number of worker nodes be a multiple of the number of private subnets. (default: 2)"
 }
 
 variable "machine_instance_type" {
   type        = string
   description = "Identifies the Instance type used by the default worker machine pool e.g. `m5.xlarge`"
-  default     = null
+  default     = "m5.xlarge"
 }
 
@@ -124,6 +124,22 @@ variable "pod_limit" {
 }
 
 
+variable "taints" {
+  description = "Taints for a machine pool. This list will overwrite any modifications made to node taints on an ongoing basis."
+  type = list(object({
+    key           = string
+    value         = string
+    schedule_type = string
+  }))
+  default = null
+}
+
+variable "labels" {
+  description = "Labels for the machine pool. Format should be a comma-separated list of 'key = value'. This list will overwrite any modifications made to node labels on an ongoing basis."
+  type        = map(string)
+  default     = null
+}
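+# Illustrative values only (not module defaults), e.g. to dedicate the pools to infra workloads:
+#   labels = { "node-role.kubernetes.io/infra" = "" }
+#   taints = [{ key = "dedicated", value = "infra", schedule_type = "NoSchedule" }]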
+
 # TODO enable when feature works
 # As of v1.6.6 ends with error: Autoscaler configuration is not available
 variable "enable_cluster_autoscaler" {
@@ -132,26 +148,37 @@ variable "enable_cluster_autoscaler" {
   type        = bool
   default     = false
   description = "Whether to create enable cluster autoscaler"
 }
 
+# tflint-ignore: terraform_unused_declarations
 variable "autoscaler_max_pod_grace_period" {
   type        = number
   default     = 30
   description = "Gives pods graceful termination time before scaling down."
 }
 
+# tflint-ignore: terraform_unused_declarations
 variable "autoscaler_pod_priority_threshold" {
   type        = number
   default     = 1
   description = "To allow users to schedule 'best-effort' pods, which shouldn't trigger Cluster Autoscaler actions, but only run when there are spare resources available."
 }
 
+# tflint-ignore: terraform_unused_declarations
 variable "autoscaler_max_node_provision_time" {
   type        = string
   default     = "10m"
   description = "Maximum time cluster-autoscaler waits for node to be provisioned."
 }
 
+# tflint-ignore: terraform_unused_declarations
 variable "autoscaler_max_nodes_total" {
   type        = number
   default     = 10
   description = "Maximum number of nodes in all node groups. Cluster autoscaler will not grow the cluster beyond this number."
 }
+
+# tflint-ignore: terraform_unused_declarations
+variable "autoscaler_max_nodes_per_zone" {
+  type        = number
+  default     = 3
+  description = "Maximum number of nodes per machine pool (zonal)"
+}
diff --git a/terraform/aws/aws-rosa/module/versions.tf b/terraform/aws/aws-rosa/module/versions.tf
index 3ce6694..9d20881 100644
--- a/terraform/aws/aws-rosa/module/versions.tf
+++ b/terraform/aws/aws-rosa/module/versions.tf
@@ -9,8 +9,8 @@ terraform {
       version = "~> 1"
     }
 
-    random = {
-      source  = "hashicorp/random"
+    null = {
+      source  = "hashicorp/null"
       version = "~> 3"
     }
   }
diff --git a/terraform/aws/aws-rosa/stage/dev/terragrunt.hcl b/terraform/aws/aws-rosa/stage/dev/terragrunt.hcl
index 0ca8d9c..357be88 100644
--- a/terraform/aws/aws-rosa/stage/dev/terragrunt.hcl
+++ b/terraform/aws/aws-rosa/stage/dev/terragrunt.hcl
@@ -18,4 +18,6 @@ inputs = {
     Region = "us-east1"
   }
   zones = ["us-east-1a", "us-east-1b", "us-east-1c"]
+  # machine_instance_type = "c5.2xlarge" # 8 cores, 16 GiB RAM
+  enable_cluster_autoscaler = true
 }