From 510cfd01ea27e9356a0cd07b000ca0363f3dbfad Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Mon, 6 Jan 2025 10:00:25 +0000 Subject: [PATCH 01/15] extend cookiecutter terraform config for compute init script --- .../terraform/compute.tf | 2 ++ .../terraform/compute/nodes.tf | 13 ++++++++++--- .../terraform/compute/variables.tf | 6 ++++++ .../terraform/variables.tf | 7 +++++++ 4 files changed, 25 insertions(+), 3 deletions(-) diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf index 14c728a5a..d52c3c42c 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf @@ -18,4 +18,6 @@ module "compute" { k3s_token = var.k3s_token control_address = [for n in openstack_compute_instance_v2.control["control"].network: n.fixed_ip_v4 if n.access_network][0] security_group_ids = [for o in data.openstack_networking_secgroup_v2.nonlogin: o.id] + + compute_init_enable = var.compute_init_enable } diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf index 7a2a706a6..ac34a443c 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf @@ -45,9 +45,16 @@ resource "openstack_compute_instance_v2" "compute" { } metadata = { - environment_root = var.environment_root - k3s_token = var.k3s_token - control_address = var.control_address + environment_root = var.environment_root + k3s_token = var.k3s_token + control_address = var.control_address + enable_compute = contains(var.compute_init_enable, "compute") + enable_resolv_conf = contains(var.compute_init_enable, "resolv_conf") + enable_etc_hosts = contains(var.compute_init_enable, "etc_hosts") + 
enable_nfs = contains(var.compute_init_enable, "nfs") + enable_manila = contains(var.compute_init_enable, "manila") + enable_basic_users = contains(var.compute_init_enable, "basic_users") + enable_eessi = contains(var.compute_init_enable, "eessi") } user_data = <<-EOF diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/variables.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/variables.tf index 3655c9e65..a0e90c61b 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/variables.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/variables.tf @@ -76,3 +76,9 @@ variable "control_address" { description = "Name/address of control node" type = string } + +variable "compute_init_enable" { + type = list(string) + description = "Groups to activate for ansible-init compute rebuilds" + default = [] +} \ No newline at end of file diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf index 0f5eefa18..19027dd19 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf @@ -52,6 +52,7 @@ variable "compute" { image_id: Overrides variable cluster_image_id vnic_type: Overrides variable vnic_type vnic_profile: Overrides variable vnic_profile + compute_init_enable: Toggles ansible-init rebuild EOF } @@ -136,3 +137,9 @@ variable "k3s_token" { description = "K3s cluster authentication token, set automatically by Ansible" type = string } + +variable "compute_init_enable" { + type = list(string) + description = "Groups to activate for ansible-init compute rebuilds" + default = [] +} \ No newline at end of file From 8290a313885dea62421bee125c9460acecf9570a Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Tue, 7 Jan 2025 11:04:21 +0000 Subject: [PATCH 02/15] define 
default compute init flags --- environments/.stackhpc/terraform/compute_init.auto.tfvars | 7 +++++++ environments/.stackhpc/terraform/main.tf | 5 +++++ 2 files changed, 12 insertions(+) create mode 100644 environments/.stackhpc/terraform/compute_init.auto.tfvars diff --git a/environments/.stackhpc/terraform/compute_init.auto.tfvars b/environments/.stackhpc/terraform/compute_init.auto.tfvars new file mode 100644 index 000000000..032ae5adb --- /dev/null +++ b/environments/.stackhpc/terraform/compute_init.auto.tfvars @@ -0,0 +1,7 @@ +compute_init_enable = [ + "compute", + "etc_hosts", + "nfs", + "basic_users", + "eessi" +] diff --git a/environments/.stackhpc/terraform/main.tf b/environments/.stackhpc/terraform/main.tf index 4284ec132..d54903cc4 100644 --- a/environments/.stackhpc/terraform/main.tf +++ b/environments/.stackhpc/terraform/main.tf @@ -58,6 +58,10 @@ variable "k3s_token" { type = string } +variable "compute_init_enable" { + type = list(string) +} + data "openstack_images_image_v2" "cluster" { name = var.cluster_image[var.os_version] most_recent = true @@ -74,6 +78,7 @@ module "cluster" { cluster_image_id = data.openstack_images_image_v2.cluster.id control_node_flavor = var.control_node_flavor k3s_token = var.k3s_token + compute_init_enable = var.compute_init_enable login_nodes = { login-0: var.other_node_flavor From 354ce1e810f4be3836919ac250e3fbb9e1634f9e Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Tue, 7 Jan 2025 12:47:42 +0000 Subject: [PATCH 03/15] add CI tests for compute node rebuilds --- .github/workflows/stackhpc.yml | 27 ++++++++------------------- 1 file changed, 8 insertions(+), 19 deletions(-) diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml index b08854adb..a5267e508 100644 --- a/.github/workflows/stackhpc.yml +++ b/.github/workflows/stackhpc.yml @@ -170,33 +170,22 @@ jobs: env: TESTUSER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }} - # - name: Build environment-specific compute image - # id: packer_build - # 
run: | - # . venv/bin/activate - # . environments/.stackhpc/activate - # cd packer/ - # packer init - # PACKER_LOG=1 packer build -except openstack.fatimage -on-error=ask -var-file=$PKR_VAR_environment_root/builder.pkrvars.hcl openstack.pkr.hcl - # ../dev/output_manifest.py packer-manifest.json # Sets NEW_COMPUTE_IMAGE_ID outputs - - # - name: Test reimage of compute nodes to new environment-specific image (via slurm) - # run: | - # . venv/bin/activate - # . environments/.stackhpc/activate - # ansible login -v -a "sudo scontrol reboot ASAP nextstate=RESUME reason='rebuild image:${{ steps.packer_build.outputs.NEW_COMPUTE_IMAGE_ID }}' ${TF_VAR_cluster_name}-compute-[0-3]" - # ansible compute -m wait_for_connection -a 'delay=60 timeout=600' # delay allows node to go down - # ansible-playbook -v ansible/ci/check_slurm.yml - - name: Test reimage of login and control nodes (via rebuild adhoc) run: | . venv/bin/activate . environments/.stackhpc/activate ansible-playbook -v --limit control,login ansible/adhoc/rebuild.yml - ansible all -m wait_for_connection -a 'delay=60 timeout=600' # delay allows node to go down ansible-playbook -v ansible/site.yml ansible-playbook -v ansible/ci/check_slurm.yml + - name: Test reimage of compute nodes and compute-init (via rebuild adhoc) + run: | + . venv/bin/activate + . environments/.stackhpc/activate + ansible-playbook -v --limit compute ansible/adhoc/rebuild.yml + ansible all -m wait_for_connection -a 'delay=60 timeout=600' # delay allows node to go down + ansible-playbook -v ansible/ci/check_slurm.yml + - name: Check sacct state survived reimage run: | . 
venv/bin/activate From b903cdd0a3350d15eee65a8f4835477e21ed15ca Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Tue, 7 Jan 2025 14:59:10 +0000 Subject: [PATCH 04/15] document metadata toggle flags and CI workflow --- ansible/roles/compute_init/README.md | 31 +++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/ansible/roles/compute_init/README.md b/ansible/roles/compute_init/README.md index 77a127245..40d9b7326 100644 --- a/ansible/roles/compute_init/README.md +++ b/ansible/roles/compute_init/README.md @@ -42,10 +42,35 @@ The following roles/groups are currently fully functional: node and all compute nodes. - `openhpc`: all functionality -# Development/debugging +All of the above are defined in the skeleton cookiecutter config, and are +toggleable via a terraform compute_init autovar file. In the .stackhpc +environment, the compute init roles are set by default to: +- `enable_compute`: This encompasses the openhpc role functionality while being + a global toggle for the entire compute-init script. +- `etc_hosts` +- `nfs` +- `basic_users` +- `eessi` + +# CI workflow + +The compute node rebuild is tested in CI after the tests for rebuilding the +login and control nodes. The process follows + +1. Compute nodes are reimaged: + + ansible-playbook -v --limit compute ansible/adhoc/rebuild.yml -To develop/debug this without actually having to build an image: +2. Ansible-init runs against newly reimaged compute nodes + +3. Run sinfo and check nodes have expected slurm state + + ansible-playbook -v ansible/ci/check_slurm.yml + +# Development/debugging +To develop/debug changes to the compute script without actually having to build +a new image: 1. Deploy a cluster using tofu and ansible/site.yml as normal. This will additionally configure the control node to export compute hostvars over NFS. @@ -103,7 +128,7 @@ as in step 3. 
available v the current approach: ``` - [root@rl9-compute-0 rocky]# grep hostvars /mnt/cluster/hostvars/rl9-compute-0/hostvars.yml + [root@rl9-compute-0 rocky]# grep hostvars /mnt/cluster/hostvars/rl9-compute-0/hostvars.yml "grafana_address": "{{ hostvars[groups['grafana'].0].api_address }}", "grafana_api_address": "{{ hostvars[groups['grafana'].0].internal_address }}", "mysql_host": "{{ hostvars[groups['mysql'] | first].api_address }}", From 2bea51cdb0ec0cb32471372e908238b82f581c16 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Wed, 8 Jan 2025 16:03:13 +0000 Subject: [PATCH 05/15] review suggestions --- .github/workflows/stackhpc.yml | 1 - ansible/roles/compute_init/README.md | 26 +--- docs/experimental/compute-init.md | 111 ++---------------- .../terraform/compute_init.auto.tfvars | 7 -- environments/.stackhpc/terraform/main.tf | 6 +- .../terraform/compute.tf | 2 +- .../terraform/compute/nodes.tf | 20 ++-- .../terraform/variables.tf | 8 +- 8 files changed, 21 insertions(+), 160 deletions(-) delete mode 100644 environments/.stackhpc/terraform/compute_init.auto.tfvars diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml index a5267e508..ea18a2274 100644 --- a/.github/workflows/stackhpc.yml +++ b/.github/workflows/stackhpc.yml @@ -183,7 +183,6 @@ jobs: . venv/bin/activate . environments/.stackhpc/activate ansible-playbook -v --limit compute ansible/adhoc/rebuild.yml - ansible all -m wait_for_connection -a 'delay=60 timeout=600' # delay allows node to go down ansible-playbook -v ansible/ci/check_slurm.yml - name: Check sacct state survived reimage diff --git a/ansible/roles/compute_init/README.md b/ansible/roles/compute_init/README.md index 40d9b7326..db18034aa 100644 --- a/ansible/roles/compute_init/README.md +++ b/ansible/roles/compute_init/README.md @@ -42,30 +42,8 @@ The following roles/groups are currently fully functional: node and all compute nodes. 
- `openhpc`: all functionality -All of the above are defined in the skeleton cookiecutter config, and are -toggleable via a terraform compute_init autovar file. In the .stackhpc -environment, the compute init roles are set by default to: -- `enable_compute`: This encompasses the openhpc role functionality while being - a global toggle for the entire compute-init script. -- `etc_hosts` -- `nfs` -- `basic_users` -- `eessi` - -# CI workflow - -The compute node rebuild is tested in CI after the tests for rebuilding the -login and control nodes. The process follows - -1. Compute nodes are reimaged: - - ansible-playbook -v --limit compute ansible/adhoc/rebuild.yml - -2. Ansible-init runs against newly reimaged compute nodes - -3. Run sinfo and check nodes have expected slurm state - - ansible-playbook -v ansible/ci/check_slurm.yml +The above may be enabled by setting the compute_init_enable property on the +terraform compute variable. # Development/debugging diff --git a/docs/experimental/compute-init.md b/docs/experimental/compute-init.md index dae840d95..c7c1d4d8c 100644 --- a/docs/experimental/compute-init.md +++ b/docs/experimental/compute-init.md @@ -2,112 +2,17 @@ See the role README.md -# Results/progress +# CI workflow -Without any metadata: +The compute node rebuild is tested in CI after the tests for rebuilding the +login and control nodes. 
The process is as follows: - [root@rl9-compute-0 rocky]# systemctl status ansible-init - ● ansible-init.service - Loaded: loaded (/etc/systemd/system/ansible-init.service; enabled; preset: disabled) - Active: activating (start) since Fri 2024-12-13 20:41:16 UTC; 1min 45s ago - Main PID: 16089 (ansible-init) - Tasks: 8 (limit: 10912) - Memory: 99.5M - CPU: 11.687s - CGroup: /system.slice/ansible-init.service - ├─16089 /usr/lib/ansible-init/bin/python /usr/bin/ansible-init - ├─16273 /usr/lib/ansible-init/bin/python3.9 /usr/lib/ansible-init/bin/ansible-playbook --connection local --inventory 127.0.0.1, /etc/ansible-init/playbooks/1-compute-init.yml - ├─16350 /usr/lib/ansible-init/bin/python3.9 /usr/lib/ansible-init/bin/ansible-playbook --connection local --inventory 127.0.0.1, /etc/ansible-init/playbooks/1-compute-init.yml - ├─16361 /bin/sh -c "/usr/bin/python3 /root/.ansible/tmp/ansible-tmp-1734122485.9542894-16350-45936546411977/AnsiballZ_mount.py && sleep 0" - ├─16362 /usr/bin/python3 /root/.ansible/tmp/ansible-tmp-1734122485.9542894-16350-45936546411977/AnsiballZ_mount.py - ├─16363 /usr/bin/mount /mnt/cluster - └─16364 /sbin/mount.nfs 192.168.10.12:/exports/cluster /mnt/cluster -o ro,sync +1. 
Compute nodes are reimaged: - Dec 13 20:41:24 rl9-compute-0.rl9.invalid ansible-init[16273]: ok: [127.0.0.1] - Dec 13 20:41:24 rl9-compute-0.rl9.invalid ansible-init[16273]: TASK [Report skipping initialization if not compute node] ********************** - Dec 13 20:41:25 rl9-compute-0.rl9.invalid ansible-init[16273]: skipping: [127.0.0.1] - Dec 13 20:41:25 rl9-compute-0.rl9.invalid ansible-init[16273]: TASK [meta] ******************************************************************** - Dec 13 20:41:25 rl9-compute-0.rl9.invalid ansible-init[16273]: skipping: [127.0.0.1] - Dec 13 20:41:25 rl9-compute-0.rl9.invalid ansible-init[16273]: TASK [Ensure the mount directory exists] *************************************** - Dec 13 20:41:25 rl9-compute-0.rl9.invalid python3[16346]: ansible-file Invoked with path=/mnt/cluster state=directory owner=root group=root mode=u=rwX,go= recurse=False force=False follow=True modification_time_format=%Y%m%d%H%M.%S access> - Dec 13 20:41:25 rl9-compute-0.rl9.invalid ansible-init[16273]: changed: [127.0.0.1] - Dec 13 20:41:25 rl9-compute-0.rl9.invalid ansible-init[16273]: TASK [Mount /mnt/cluster] ****************************************************** - Dec 13 20:41:26 rl9-compute-0.rl9.invalid python3[16362]: ansible-mount Invoked with path=/mnt/cluster src=192.168.10.12:/exports/cluster fstype=nfs opts=ro,sync state=mounted boot=True dump=0 passno=0 backup=False fstab=None - [root@rl9-compute-0 rocky]# systemctl status ansible-init + ansible-playbook -v --limit compute ansible/adhoc/rebuild.yml -Added metadata via horizon: +2. Ansible-init runs against newly reimaged compute nodes - compute_groups ["compute"] +3. 
Run sinfo and check nodes have expected slurm state - -OK: - - [root@rl9-compute-0 rocky]# systemctl status ansible-init - ● ansible-init.service - Loaded: loaded (/etc/systemd/system/ansible-init.service; enabled; preset: disabled) - Active: active (exited) since Fri 2024-12-13 20:43:31 UTC; 33s ago - Process: 16089 ExecStart=/usr/bin/ansible-init (code=exited, status=0/SUCCESS) - Main PID: 16089 (code=exited, status=0/SUCCESS) - CPU: 13.003s - - Dec 13 20:43:31 rl9-compute-0.rl9.invalid ansible-init[16273]: ok: [127.0.0.1] => { - Dec 13 20:43:31 rl9-compute-0.rl9.invalid ansible-init[16273]: "msg": "Skipping compute initialization as cannot mount exports/cluster share" - Dec 13 20:43:31 rl9-compute-0.rl9.invalid ansible-init[16273]: } - Dec 13 20:43:31 rl9-compute-0.rl9.invalid ansible-init[16273]: TASK [meta] ******************************************************************** - Dec 13 20:43:31 rl9-compute-0.rl9.invalid ansible-init[16273]: PLAY RECAP ********************************************************************* - Dec 13 20:43:31 rl9-compute-0.rl9.invalid ansible-init[16273]: 127.0.0.1 : ok=4 changed=1 unreachable=0 failed=0 skipped=1 rescued=0 ignored=1 - Dec 13 20:43:31 rl9-compute-0.rl9.invalid ansible-init[16089]: [INFO] executing remote playbooks for stage - post - Dec 13 20:43:31 rl9-compute-0.rl9.invalid ansible-init[16089]: [INFO] writing sentinel file /var/lib/ansible-init.done - Dec 13 20:43:31 rl9-compute-0.rl9.invalid ansible-init[16089]: [INFO] ansible-init completed successfully - Dec 13 20:43:31 rl9-compute-0.rl9.invalid systemd[1]: Finished ansible-init.service. 
- -Now run site.yml, then restart ansible-init again: - - - [root@rl9-compute-0 rocky]# systemctl status ansible-init - ● ansible-init.service - Loaded: loaded (/etc/systemd/system/ansible-init.service; enabled; preset: disabled) - Active: active (exited) since Fri 2024-12-13 20:50:10 UTC; 11s ago - Process: 18921 ExecStart=/usr/bin/ansible-init (code=exited, status=0/SUCCESS) - Main PID: 18921 (code=exited, status=0/SUCCESS) - CPU: 8.240s - - Dec 13 20:50:10 rl9-compute-0.rl9.invalid ansible-init[19110]: TASK [Report skipping initialization if cannot mount nfs] ********************** - Dec 13 20:50:10 rl9-compute-0.rl9.invalid ansible-init[19110]: skipping: [127.0.0.1] - Dec 13 20:50:10 rl9-compute-0.rl9.invalid ansible-init[19110]: TASK [meta] ******************************************************************** - Dec 13 20:50:10 rl9-compute-0.rl9.invalid ansible-init[19110]: skipping: [127.0.0.1] - Dec 13 20:50:10 rl9-compute-0.rl9.invalid ansible-init[19110]: PLAY RECAP ********************************************************************* - Dec 13 20:50:10 rl9-compute-0.rl9.invalid ansible-init[19110]: 127.0.0.1 : ok=3 changed=1 unreachable=0 failed=0 skipped=2 rescued=0 ignored=0 - Dec 13 20:50:10 rl9-compute-0.rl9.invalid ansible-init[18921]: [INFO] executing remote playbooks for stage - post - Dec 13 20:50:10 rl9-compute-0.rl9.invalid ansible-init[18921]: [INFO] writing sentinel file /var/lib/ansible-init.done - Dec 13 20:50:10 rl9-compute-0.rl9.invalid ansible-init[18921]: [INFO] ansible-init completed successfully - Dec 13 20:50:10 rl9-compute-0.rl9.invalid systemd[1]: Finished ansible-init.service. 
- [root@rl9-compute-0 rocky]# ls /mnt/cluster/host - hosts hostvars/ - [root@rl9-compute-0 rocky]# ls /mnt/cluster/hostvars/rl9-compute- - rl9-compute-0/ rl9-compute-1/ - [root@rl9-compute-0 rocky]# ls /mnt/cluster/hostvars/rl9-compute- - rl9-compute-0/ rl9-compute-1/ - [root@rl9-compute-0 rocky]# ls /mnt/cluster/hostvars/rl9-compute-0/ - hostvars.yml - -This commit - shows that hostvars have loaded: - - [root@rl9-compute-0 rocky]# systemctl status ansible-init - ● ansible-init.service - Loaded: loaded (/etc/systemd/system/ansible-init.service; enabled; preset: disabled) - Active: active (exited) since Fri 2024-12-13 21:06:20 UTC; 5s ago - Process: 27585 ExecStart=/usr/bin/ansible-init (code=exited, status=0/SUCCESS) - Main PID: 27585 (code=exited, status=0/SUCCESS) - CPU: 8.161s - - Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27769]: TASK [Demonstrate hostvars have loaded] **************************************** - Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27769]: ok: [127.0.0.1] => { - Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27769]: "prometheus_version": "2.27.0" - Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27769]: } - Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27769]: PLAY RECAP ********************************************************************* - Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27769]: 127.0.0.1 : ok=5 changed=0 unreachable=0 failed=0 skipped=2 rescued=0 ignored=0 - Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27585]: [INFO] executing remote playbooks for stage - post - Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27585]: [INFO] writing sentinel file /var/lib/ansible-init.done - Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27585]: [INFO] ansible-init completed successfully - Dec 13 21:06:20 rl9-compute-0.rl9.invalid systemd[1]: Finished ansible-init.service. 
+ ansible-playbook -v ansible/ci/check_slurm.yml \ No newline at end of file diff --git a/environments/.stackhpc/terraform/compute_init.auto.tfvars b/environments/.stackhpc/terraform/compute_init.auto.tfvars deleted file mode 100644 index 032ae5adb..000000000 --- a/environments/.stackhpc/terraform/compute_init.auto.tfvars +++ /dev/null @@ -1,7 +0,0 @@ -compute_init_enable = [ - "compute", - "etc_hosts", - "nfs", - "basic_users", - "eessi" -] diff --git a/environments/.stackhpc/terraform/main.tf b/environments/.stackhpc/terraform/main.tf index d54903cc4..872003db3 100644 --- a/environments/.stackhpc/terraform/main.tf +++ b/environments/.stackhpc/terraform/main.tf @@ -58,10 +58,6 @@ variable "k3s_token" { type = string } -variable "compute_init_enable" { - type = list(string) -} - data "openstack_images_image_v2" "cluster" { name = var.cluster_image[var.os_version] most_recent = true @@ -78,7 +74,6 @@ module "cluster" { cluster_image_id = data.openstack_images_image_v2.cluster.id control_node_flavor = var.control_node_flavor k3s_token = var.k3s_token - compute_init_enable = var.compute_init_enable login_nodes = { login-0: var.other_node_flavor @@ -87,6 +82,7 @@ module "cluster" { standard: { # NB: can't call this default! 
nodes: ["compute-0", "compute-1"] flavor: var.other_node_flavor + compute_init_enable: ["compute", "etc_hosts", "nfs", "basic_users", "eessi"] } # Example of how to add another partition: # extra: { diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf index d52c3c42c..dcc692c1a 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf @@ -19,5 +19,5 @@ module "compute" { control_address = [for n in openstack_compute_instance_v2.control["control"].network: n.fixed_ip_v4 if n.access_network][0] security_group_ids = [for o in data.openstack_networking_secgroup_v2.nonlogin: o.id] - compute_init_enable = var.compute_init_enable + compute_init_enable = each.value.compute_init_enable } diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf index ac34a443c..d3a37bc5b 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf @@ -44,18 +44,14 @@ resource "openstack_compute_instance_v2" "compute" { access_network = true } - metadata = { - environment_root = var.environment_root - k3s_token = var.k3s_token - control_address = var.control_address - enable_compute = contains(var.compute_init_enable, "compute") - enable_resolv_conf = contains(var.compute_init_enable, "resolv_conf") - enable_etc_hosts = contains(var.compute_init_enable, "etc_hosts") - enable_nfs = contains(var.compute_init_enable, "nfs") - enable_manila = contains(var.compute_init_enable, "manila") - enable_basic_users = contains(var.compute_init_enable, "basic_users") - enable_eessi = contains(var.compute_init_enable, "eessi") - } + metadata = merge( + { + environment_root = 
var.environment_root + k3s_token = var.k3s_token + control_address = var.control_address + }, + {for e in var.compute_init_enable: e => true} + ) user_data = <<-EOF #cloud-config diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf index 19027dd19..b2e16c942 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf @@ -52,7 +52,7 @@ variable "compute" { image_id: Overrides variable cluster_image_id vnic_type: Overrides variable vnic_type vnic_profile: Overrides variable vnic_profile - compute_init_enable: Toggles ansible-init rebuild + compute_init_enable: Toggles compute-init rebuild (see compute-init role docs) EOF } @@ -136,10 +136,4 @@ variable "root_volume_size" { variable "k3s_token" { description = "K3s cluster authentication token, set automatically by Ansible" type = string -} - -variable "compute_init_enable" { - type = list(string) - description = "Groups to activate for ansible-init compute rebuilds" - default = [] } \ No newline at end of file From 038ddf744a0d4dc9e79b3d84620bff97fbf71b21 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Thu, 9 Jan 2025 09:01:22 +0000 Subject: [PATCH 06/15] add delay for ansible-init to finish --- .github/workflows/stackhpc.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml index ea18a2274..4d0fbb9bb 100644 --- a/.github/workflows/stackhpc.yml +++ b/.github/workflows/stackhpc.yml @@ -183,6 +183,7 @@ jobs: . venv/bin/activate . 
environments/.stackhpc/activate ansible-playbook -v --limit compute ansible/adhoc/rebuild.yml + ansible all -m wait_for_connection -a 'delay=60 timeout=600' ansible-playbook -v ansible/ci/check_slurm.yml - name: Check sacct state survived reimage From 7057c5090cd918c99d6339ba60a71eede8e5a004 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Thu, 9 Jan 2025 12:04:29 +0000 Subject: [PATCH 07/15] remove delay in compute node rebuild ci --- .github/workflows/stackhpc.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml index b2651af2f..d5bd313ca 100644 --- a/.github/workflows/stackhpc.yml +++ b/.github/workflows/stackhpc.yml @@ -183,7 +183,6 @@ jobs: . venv/bin/activate . environments/.stackhpc/activate ansible-playbook -v --limit compute ansible/adhoc/rebuild.yml - ansible all -m wait_for_connection -a 'delay=60 timeout=600' ansible-playbook -v ansible/ci/check_slurm.yml - name: Check sacct state survived reimage From 3faa81382941174657f0b2a8c9cf35f135c9debc Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Thu, 9 Jan 2025 14:25:50 +0000 Subject: [PATCH 08/15] fix compute init metadata flags --- ansible/roles/compute_init/files/compute-init.yml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml index c7a9048b4..430e2cf65 100644 --- a/ansible/roles/compute_init/files/compute-init.yml +++ b/ansible/roles/compute_init/files/compute-init.yml @@ -6,13 +6,13 @@ vars: os_metadata: "{{ lookup('url', 'http://169.254.169.254/openstack/latest/meta_data.json') | from_json }}" server_node_ip: "{{ os_metadata.meta.control_address }}" - enable_compute: "{{ os_metadata.meta.enable_compute | default(false) | bool }}" - enable_resolv_conf: "{{ os_metadata.meta.enable_resolv_conf | default(false) | bool }}" - enable_etc_hosts: "{{ os_metadata.meta.enable_etc_hosts | default(false) | bool }}" - 
enable_nfs: "{{ os_metadata.meta.enable_nfs | default(false) | bool }}" - enable_manila: "{{ os_metadata.meta.enable_manila | default(false) | bool }}" - enable_basic_users: "{{ os_metadata.meta.enable_basic_users | default(false) | bool }}" - enable_eessi: "{{ os_metadata.meta.enable_eessi | default(false) | bool }}" + enable_compute: "{{ os_metadata.meta.compute | default(false) | bool }}" + enable_resolv_conf: "{{ os_metadata.meta.resolv_conf | default(false) | bool }}" + enable_etc_hosts: "{{ os_metadata.meta.etc_hosts | default(false) | bool }}" + enable_nfs: "{{ os_metadata.meta.nfs | default(false) | bool }}" + enable_manila: "{{ os_metadata.meta.manila | default(false) | bool }}" + enable_basic_users: "{{ os_metadata.meta.basic_users | default(false) | bool }}" + enable_eessi: "{{ os_metadata.meta.eessi | default(false) | bool }}" # TODO: "= role defaults" - could be moved to a vars_file: on play with similar precedence effects resolv_conf_nameservers: [] From bc16dbaa25da04d4a350413d343a18fbcb0f7e68 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Thu, 9 Jan 2025 15:14:02 +0000 Subject: [PATCH 09/15] bump image --- .../.stackhpc/terraform/cluster_image.auto.tfvars.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json index 47681ea8a..cb4b4e32e 100644 --- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-250108-1703-e515b902", - "RL9": "openhpc-RL9-250108-1703-e515b902" + "RL8": "openhpc-RL8-250109-1431-3faa8138", + "RL9": "openhpc-RL9-250109-1431-3faa8138" } } From d2e18d0c5346509abc7546bdd70fc74a5ca87e5e Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Fri, 10 Jan 2025 09:16:10 +0000 Subject: [PATCH 10/15] bump image --- 
.../.stackhpc/terraform/cluster_image.auto.tfvars.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json index 3c1e19058..3c43e02eb 100644 --- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-250109-1444-ecea8219", - "RL9": "openhpc-RL9-250109-1444-ecea8219" + "RL8": "openhpc-RL8-250109-2102-5193ba2f", + "RL9": "openhpc-RL9-250110-0016-5193ba2f" } } From 438ed3ad6f40916e4256070846724f298f8c274d Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Fri, 10 Jan 2025 17:37:44 +0000 Subject: [PATCH 11/15] adjust check_slurm logic to deal with idle* state --- ansible/ci/check_slurm.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ansible/ci/check_slurm.yml b/ansible/ci/check_slurm.yml index d95c5bb5c..6507caf08 100644 --- a/ansible/ci/check_slurm.yml +++ b/ansible/ci/check_slurm.yml @@ -6,9 +6,9 @@ shell: 'sinfo --noheader --format="%N %P %a %l %D %t" | sort' # using --format ensures we control whitespace: Partition,partition_state,max_jobtime,num_nodes,node_state,node_name register: sinfo changed_when: false - until: "'boot' not in sinfo.stdout_lines" - retries: 5 - delay: 10 + until: not ("boot" in sinfo.stdout or "idle*" in sinfo.stdout) + retries: 10 + delay: 5 - name: Check nodes have expected slurm state assert: that: sinfo.stdout_lines == expected_sinfo From fd5cbf992bfa9aca2e018e2051172a8348e2ec70 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Tue, 14 Jan 2025 09:25:07 +0000 Subject: [PATCH 12/15] pause in workflow to debug slurm state --- .github/workflows/stackhpc.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml index d5bd313ca..35630d4dc 100644 --- 
a/.github/workflows/stackhpc.yml +++ b/.github/workflows/stackhpc.yml @@ -185,6 +185,9 @@ jobs: ansible-playbook -v --limit compute ansible/adhoc/rebuild.yml ansible-playbook -v ansible/ci/check_slurm.yml + - name: Pause for debugging + run: sleep 1800 + - name: Check sacct state survived reimage run: | . venv/bin/activate From f661c7fef6a741fe715d24815f7350b66d2e64ea Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Tue, 14 Jan 2025 10:49:01 +0000 Subject: [PATCH 13/15] debug wait on failure --- .github/workflows/stackhpc.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml index 35630d4dc..f8b0167ae 100644 --- a/.github/workflows/stackhpc.yml +++ b/.github/workflows/stackhpc.yml @@ -186,7 +186,8 @@ jobs: ansible-playbook -v ansible/ci/check_slurm.yml - name: Pause for debugging - run: sleep 1800 + if: failure() + run: sleep 3600 - name: Check sacct state survived reimage run: | From 81c316a594aa3bc602350d80ea31e4731c11d001 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Tue, 14 Jan 2025 15:40:33 +0000 Subject: [PATCH 14/15] allow empty compute_init_enable list --- .github/workflows/stackhpc.yml | 4 ---- ansible/extras.yml | 1 - .../{{cookiecutter.environment}}/terraform/compute.tf | 4 ++-- 3 files changed, 2 insertions(+), 7 deletions(-) diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml index f8b0167ae..d5bd313ca 100644 --- a/.github/workflows/stackhpc.yml +++ b/.github/workflows/stackhpc.yml @@ -185,10 +185,6 @@ jobs: ansible-playbook -v --limit compute ansible/adhoc/rebuild.yml ansible-playbook -v ansible/ci/check_slurm.yml - - name: Pause for debugging - if: failure() - run: sleep 3600 - - name: Check sacct state survived reimage run: | . 
venv/bin/activate diff --git a/ansible/extras.yml b/ansible/extras.yml index 6bb141109..13a887dd9 100644 --- a/ansible/extras.yml +++ b/ansible/extras.yml @@ -44,7 +44,6 @@ # NB: has to be after eeesi and os-manila-mount tags: compute_init become: yes - name: Export hostvars tasks: - include_role: name: compute_init diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf index 20fcd5d89..a90108924 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf @@ -20,11 +20,11 @@ module "compute" { root_volume_size = lookup(each.value, "root_volume_size", var.root_volume_size) extra_volumes = lookup(each.value, "extra_volumes", {}) + compute_init_enable = lookup(each.value, "compute_init_enable", []) + key_pair = var.key_pair environment_root = var.environment_root k3s_token = var.k3s_token control_address = [for n in openstack_compute_instance_v2.control["control"].network: n.fixed_ip_v4 if n.access_network][0] security_group_ids = [for o in data.openstack_networking_secgroup_v2.nonlogin: o.id] - - compute_init_enable = each.value.compute_init_enable } From 9897f29b7220a6f7bce6b06a2da41c6b2d068158 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Tue, 14 Jan 2025 17:04:16 +0000 Subject: [PATCH 15/15] bump images --- .../.stackhpc/terraform/cluster_image.auto.tfvars.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json index 3c43e02eb..37bd8c3d6 100644 --- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-250109-2102-5193ba2f", - "RL9": "openhpc-RL9-250110-0016-5193ba2f" + 
"RL8": "openhpc-RL8-250114-1627-bccc88b5", + "RL9": "openhpc-RL9-250114-1626-bccc88b5" } }