diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml
index eaca3a3ae..d5bd313ca 100644
--- a/.github/workflows/stackhpc.yml
+++ b/.github/workflows/stackhpc.yml
@@ -170,33 +170,21 @@ jobs:
         env:
           DEMO_USER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }}
 
-      # - name: Build environment-specific compute image
-      #   id: packer_build
-      #   run: |
-      #     . venv/bin/activate
-      #     . environments/.stackhpc/activate
-      #     cd packer/
-      #     packer init
-      #     PACKER_LOG=1 packer build -except openstack.fatimage -on-error=ask -var-file=$PKR_VAR_environment_root/builder.pkrvars.hcl openstack.pkr.hcl
-      #     ../dev/output_manifest.py packer-manifest.json # Sets NEW_COMPUTE_IMAGE_ID outputs
-
-      # - name: Test reimage of compute nodes to new environment-specific image (via slurm)
-      #   run: |
-      #     . venv/bin/activate
-      #     . environments/.stackhpc/activate
-      #     ansible login -v -a "sudo scontrol reboot ASAP nextstate=RESUME reason='rebuild image:${{ steps.packer_build.outputs.NEW_COMPUTE_IMAGE_ID }}' ${TF_VAR_cluster_name}-compute-[0-3]"
-      #     ansible compute -m wait_for_connection -a 'delay=60 timeout=600' # delay allows node to go down
-      #     ansible-playbook -v ansible/ci/check_slurm.yml
-
       - name: Test reimage of login and control nodes (via rebuild adhoc)
         run: |
           . venv/bin/activate
           . environments/.stackhpc/activate
           ansible-playbook -v --limit control,login ansible/adhoc/rebuild.yml
-          ansible all -m wait_for_connection -a 'delay=60 timeout=600' # delay allows node to go down
           ansible-playbook -v ansible/site.yml
           ansible-playbook -v ansible/ci/check_slurm.yml
 
+      - name: Test reimage of compute nodes and compute-init (via rebuild adhoc)
+        run: |
+          . venv/bin/activate
+          . environments/.stackhpc/activate
+          ansible-playbook -v --limit compute ansible/adhoc/rebuild.yml
+          ansible-playbook -v ansible/ci/check_slurm.yml
+
       - name: Check sacct state survived reimage
         run: |
           . venv/bin/activate
diff --git a/ansible/ci/check_slurm.yml b/ansible/ci/check_slurm.yml
index d95c5bb5c..6507caf08 100644
--- a/ansible/ci/check_slurm.yml
+++ b/ansible/ci/check_slurm.yml
@@ -6,9 +6,9 @@
       shell: 'sinfo --noheader --format="%N %P %a %l %D %t" | sort' # using --format ensures we control whitespace: Partition,partition_state,max_jobtime,num_nodes,node_state,node_name
       register: sinfo
       changed_when: false
-      until: "'boot' not in sinfo.stdout_lines"
-      retries: 5
-      delay: 10
+      until: not ("boot" in sinfo.stdout or "idle*" in sinfo.stdout)
+      retries: 10
+      delay: 5
     - name: Check nodes have expected slurm state
       assert:
         that: sinfo.stdout_lines == expected_sinfo
diff --git a/ansible/extras.yml b/ansible/extras.yml
index 6bb141109..13a887dd9 100644
--- a/ansible/extras.yml
+++ b/ansible/extras.yml
@@ -44,7 +44,6 @@
   # NB: has to be after eeesi and os-manila-mount
   tags: compute_init
   become: yes
-  name: Export hostvars
   tasks:
     - include_role:
         name: compute_init
diff --git a/ansible/roles/compute_init/README.md b/ansible/roles/compute_init/README.md
index 77a127245..db18034aa 100644
--- a/ansible/roles/compute_init/README.md
+++ b/ansible/roles/compute_init/README.md
@@ -42,10 +42,13 @@ The following roles/groups are currently fully functional:
   node and all compute nodes.
 - `openhpc`: all functionality
 
-# Development/debugging
+The above may be enabled by setting the `compute_init_enable` property on the
+terraform `compute` variable.
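+
+For example (an illustrative sketch mirroring the `.stackhpc` CI environment;
+group names, node names and flavors are site-specific):
+
+```
+compute = {
+  standard: {   # illustrative group name - use your own partition/group names
+    nodes: ["compute-0", "compute-1"]
+    flavor: var.other_node_flavor
+    compute_init_enable: ["compute", "etc_hosts", "nfs", "basic_users", "eessi"]
+  }
+}
+```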
 
-To develop/debug this without actually having to build an image:
+# Development/debugging
 
+To develop/debug changes to the compute script without actually having to build
+a new image:
 1. Deploy a cluster using tofu and ansible/site.yml as normal. This will
    additionally configure the control node to export compute hostvars over NFS.
 
@@ -103,7 +106,7 @@ as in step 3.
    available v the current approach:
 
    ```
-   [root@rl9-compute-0 rocky]# grep hostvars /mnt/cluster/hostvars/rl9-compute-0/hostvars.yml
+   [root@rl9-compute-0 rocky]# grep hostvars /mnt/cluster/hostvars/rl9-compute-0/hostvars.yml
       "grafana_address": "{{ hostvars[groups['grafana'].0].api_address }}",
       "grafana_api_address": "{{ hostvars[groups['grafana'].0].internal_address }}",
       "mysql_host": "{{ hostvars[groups['mysql'] | first].api_address }}",
diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml
index c7a9048b4..430e2cf65 100644
--- a/ansible/roles/compute_init/files/compute-init.yml
+++ b/ansible/roles/compute_init/files/compute-init.yml
@@ -6,13 +6,13 @@
   vars:
     os_metadata: "{{ lookup('url', 'http://169.254.169.254/openstack/latest/meta_data.json') | from_json }}"
     server_node_ip: "{{ os_metadata.meta.control_address }}"
-    enable_compute: "{{ os_metadata.meta.enable_compute | default(false) | bool }}"
-    enable_resolv_conf: "{{ os_metadata.meta.enable_resolv_conf | default(false) | bool }}"
-    enable_etc_hosts: "{{ os_metadata.meta.enable_etc_hosts | default(false) | bool }}"
-    enable_nfs: "{{ os_metadata.meta.enable_nfs | default(false) | bool }}"
-    enable_manila: "{{ os_metadata.meta.enable_manila | default(false) | bool }}"
-    enable_basic_users: "{{ os_metadata.meta.enable_basic_users | default(false) | bool }}"
-    enable_eessi: "{{ os_metadata.meta.enable_eessi | default(false) | bool }}"
+    enable_compute: "{{ os_metadata.meta.compute | default(false) | bool }}"
+    enable_resolv_conf: "{{ os_metadata.meta.resolv_conf | default(false) | bool }}"
+    enable_etc_hosts: "{{ os_metadata.meta.etc_hosts | default(false) | bool }}"
+    enable_nfs: "{{ os_metadata.meta.nfs | default(false) | bool }}"
+    enable_manila: "{{ os_metadata.meta.manila | default(false) | bool }}"
+    enable_basic_users: "{{ os_metadata.meta.basic_users | default(false) | bool }}"
+    enable_eessi: "{{ os_metadata.meta.eessi | default(false) | bool }}"
 
     # TODO: "= role defaults" - could be moved to a vars_file: on play with similar precedence effects
     resolv_conf_nameservers: []
diff --git a/docs/experimental/compute-init.md b/docs/experimental/compute-init.md
index dae840d95..c7c1d4d8c 100644
--- a/docs/experimental/compute-init.md
+++ b/docs/experimental/compute-init.md
@@ -2,112 +2,17 @@
 
 See the role README.md
 
-# Results/progress
+# CI workflow
 
-Without any metadata:
+The compute node rebuild is tested in CI after the tests for rebuilding the
+login and control nodes. The process is as follows:
+
-
-    [root@rl9-compute-0 rocky]# systemctl status ansible-init
-    ● ansible-init.service
-         Loaded: loaded (/etc/systemd/system/ansible-init.service; enabled; preset: disabled)
-         Active: activating (start) since Fri 2024-12-13 20:41:16 UTC; 1min 45s ago
-       Main PID: 16089 (ansible-init)
-          Tasks: 8 (limit: 10912)
-         Memory: 99.5M
-            CPU: 11.687s
-         CGroup: /system.slice/ansible-init.service
-                 ├─16089 /usr/lib/ansible-init/bin/python /usr/bin/ansible-init
-                 ├─16273 /usr/lib/ansible-init/bin/python3.9 /usr/lib/ansible-init/bin/ansible-playbook --connection local --inventory 127.0.0.1, /etc/ansible-init/playbooks/1-compute-init.yml
-                 ├─16350 /usr/lib/ansible-init/bin/python3.9 /usr/lib/ansible-init/bin/ansible-playbook --connection local --inventory 127.0.0.1, /etc/ansible-init/playbooks/1-compute-init.yml
-                 ├─16361 /bin/sh -c "/usr/bin/python3 /root/.ansible/tmp/ansible-tmp-1734122485.9542894-16350-45936546411977/AnsiballZ_mount.py && sleep 0"
-                 ├─16362 /usr/bin/python3 /root/.ansible/tmp/ansible-tmp-1734122485.9542894-16350-45936546411977/AnsiballZ_mount.py
-                 ├─16363 /usr/bin/mount /mnt/cluster
-                 └─16364 /sbin/mount.nfs 192.168.10.12:/exports/cluster /mnt/cluster -o ro,sync
+1. Compute nodes are reimaged:
-
-    Dec 13 20:41:24 rl9-compute-0.rl9.invalid ansible-init[16273]: ok: [127.0.0.1]
-    Dec 13 20:41:24 rl9-compute-0.rl9.invalid ansible-init[16273]: TASK [Report skipping initialization if not compute node] **********************
-    Dec 13 20:41:25 rl9-compute-0.rl9.invalid ansible-init[16273]: skipping: [127.0.0.1]
-    Dec 13 20:41:25 rl9-compute-0.rl9.invalid ansible-init[16273]: TASK [meta] ********************************************************************
-    Dec 13 20:41:25 rl9-compute-0.rl9.invalid ansible-init[16273]: skipping: [127.0.0.1]
-    Dec 13 20:41:25 rl9-compute-0.rl9.invalid ansible-init[16273]: TASK [Ensure the mount directory exists] ***************************************
-    Dec 13 20:41:25 rl9-compute-0.rl9.invalid python3[16346]: ansible-file Invoked with path=/mnt/cluster state=directory owner=root group=root mode=u=rwX,go= recurse=False force=False follow=True modification_time_format=%Y%m%d%H%M.%S access>
-    Dec 13 20:41:25 rl9-compute-0.rl9.invalid ansible-init[16273]: changed: [127.0.0.1]
-    Dec 13 20:41:25 rl9-compute-0.rl9.invalid ansible-init[16273]: TASK [Mount /mnt/cluster] ******************************************************
-    Dec 13 20:41:26 rl9-compute-0.rl9.invalid python3[16362]: ansible-mount Invoked with path=/mnt/cluster src=192.168.10.12:/exports/cluster fstype=nfs opts=ro,sync state=mounted boot=True dump=0 passno=0 backup=False fstab=None
-    [root@rl9-compute-0 rocky]# systemctl status ansible-init
+
+   ansible-playbook -v --limit compute ansible/adhoc/rebuild.yml
+
-Added metadata via horizon:
+2. Ansible-init runs against newly reimaged compute nodes
+
-    compute_groups ["compute"]
+3. `sinfo` output is checked to confirm nodes have the expected Slurm state:
-
-OK:
-
-    [root@rl9-compute-0 rocky]# systemctl status ansible-init
-    ● ansible-init.service
-         Loaded: loaded (/etc/systemd/system/ansible-init.service; enabled; preset: disabled)
-         Active: active (exited) since Fri 2024-12-13 20:43:31 UTC; 33s ago
-        Process: 16089 ExecStart=/usr/bin/ansible-init (code=exited, status=0/SUCCESS)
-       Main PID: 16089 (code=exited, status=0/SUCCESS)
-            CPU: 13.003s
-
-    Dec 13 20:43:31 rl9-compute-0.rl9.invalid ansible-init[16273]: ok: [127.0.0.1] => {
-    Dec 13 20:43:31 rl9-compute-0.rl9.invalid ansible-init[16273]: "msg": "Skipping compute initialization as cannot mount exports/cluster share"
-    Dec 13 20:43:31 rl9-compute-0.rl9.invalid ansible-init[16273]: }
-    Dec 13 20:43:31 rl9-compute-0.rl9.invalid ansible-init[16273]: TASK [meta] ********************************************************************
-    Dec 13 20:43:31 rl9-compute-0.rl9.invalid ansible-init[16273]: PLAY RECAP *********************************************************************
-    Dec 13 20:43:31 rl9-compute-0.rl9.invalid ansible-init[16273]: 127.0.0.1 : ok=4 changed=1 unreachable=0 failed=0 skipped=1 rescued=0 ignored=1
-    Dec 13 20:43:31 rl9-compute-0.rl9.invalid ansible-init[16089]: [INFO] executing remote playbooks for stage - post
-    Dec 13 20:43:31 rl9-compute-0.rl9.invalid ansible-init[16089]: [INFO] writing sentinel file /var/lib/ansible-init.done
-    Dec 13 20:43:31 rl9-compute-0.rl9.invalid ansible-init[16089]: [INFO] ansible-init completed successfully
-    Dec 13 20:43:31 rl9-compute-0.rl9.invalid systemd[1]: Finished ansible-init.service.
-
-Now run site.yml, then restart ansible-init again:
-
-    [root@rl9-compute-0 rocky]# systemctl status ansible-init
-    ● ansible-init.service
-         Loaded: loaded (/etc/systemd/system/ansible-init.service; enabled; preset: disabled)
-         Active: active (exited) since Fri 2024-12-13 20:50:10 UTC; 11s ago
-        Process: 18921 ExecStart=/usr/bin/ansible-init (code=exited, status=0/SUCCESS)
-       Main PID: 18921 (code=exited, status=0/SUCCESS)
-            CPU: 8.240s
-
-    Dec 13 20:50:10 rl9-compute-0.rl9.invalid ansible-init[19110]: TASK [Report skipping initialization if cannot mount nfs] **********************
-    Dec 13 20:50:10 rl9-compute-0.rl9.invalid ansible-init[19110]: skipping: [127.0.0.1]
-    Dec 13 20:50:10 rl9-compute-0.rl9.invalid ansible-init[19110]: TASK [meta] ********************************************************************
-    Dec 13 20:50:10 rl9-compute-0.rl9.invalid ansible-init[19110]: skipping: [127.0.0.1]
-    Dec 13 20:50:10 rl9-compute-0.rl9.invalid ansible-init[19110]: PLAY RECAP *********************************************************************
-    Dec 13 20:50:10 rl9-compute-0.rl9.invalid ansible-init[19110]: 127.0.0.1 : ok=3 changed=1 unreachable=0 failed=0 skipped=2 rescued=0 ignored=0
-    Dec 13 20:50:10 rl9-compute-0.rl9.invalid ansible-init[18921]: [INFO] executing remote playbooks for stage - post
-    Dec 13 20:50:10 rl9-compute-0.rl9.invalid ansible-init[18921]: [INFO] writing sentinel file /var/lib/ansible-init.done
-    Dec 13 20:50:10 rl9-compute-0.rl9.invalid ansible-init[18921]: [INFO] ansible-init completed successfully
-    Dec 13 20:50:10 rl9-compute-0.rl9.invalid systemd[1]: Finished ansible-init.service.
-    [root@rl9-compute-0 rocky]# ls /mnt/cluster/host
-    hosts  hostvars/
-    [root@rl9-compute-0 rocky]# ls /mnt/cluster/hostvars/rl9-compute-
-    rl9-compute-0/ rl9-compute-1/
-    [root@rl9-compute-0 rocky]# ls /mnt/cluster/hostvars/rl9-compute-
-    rl9-compute-0/ rl9-compute-1/
-    [root@rl9-compute-0 rocky]# ls /mnt/cluster/hostvars/rl9-compute-0/
-    hostvars.yml
-
-This commit
-shows that hostvars have loaded:
-
-    [root@rl9-compute-0 rocky]# systemctl status ansible-init
-    ● ansible-init.service
-         Loaded: loaded (/etc/systemd/system/ansible-init.service; enabled; preset: disabled)
-         Active: active (exited) since Fri 2024-12-13 21:06:20 UTC; 5s ago
-        Process: 27585 ExecStart=/usr/bin/ansible-init (code=exited, status=0/SUCCESS)
-       Main PID: 27585 (code=exited, status=0/SUCCESS)
-            CPU: 8.161s
-
-    Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27769]: TASK [Demonstrate hostvars have loaded] ****************************************
-    Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27769]: ok: [127.0.0.1] => {
-    Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27769]: "prometheus_version": "2.27.0"
-    Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27769]: }
-    Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27769]: PLAY RECAP *********************************************************************
-    Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27769]: 127.0.0.1 : ok=5 changed=0 unreachable=0 failed=0 skipped=2 rescued=0 ignored=0
-    Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27585]: [INFO] executing remote playbooks for stage - post
-    Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27585]: [INFO] writing sentinel file /var/lib/ansible-init.done
-    Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27585]: [INFO] ansible-init completed successfully
-    Dec 13 21:06:20 rl9-compute-0.rl9.invalid systemd[1]: Finished ansible-init.service.
+
+   ansible-playbook -v ansible/ci/check_slurm.yml
\ No newline at end of file
diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json
index 3c1e19058..37bd8c3d6 100644
--- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json
+++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json
@@ -1,6 +1,6 @@
 {
   "cluster_image": {
-    "RL8": "openhpc-RL8-250109-1444-ecea8219",
-    "RL9": "openhpc-RL9-250109-1444-ecea8219"
+    "RL8": "openhpc-RL8-250114-1627-bccc88b5",
+    "RL9": "openhpc-RL9-250114-1626-bccc88b5"
   }
 }
diff --git a/environments/.stackhpc/terraform/main.tf b/environments/.stackhpc/terraform/main.tf
index 4284ec132..872003db3 100644
--- a/environments/.stackhpc/terraform/main.tf
+++ b/environments/.stackhpc/terraform/main.tf
@@ -82,6 +82,7 @@ module "cluster" {
     standard: {  # NB: can't call this default!
nodes: ["compute-0", "compute-1"] flavor: var.other_node_flavor + compute_init_enable: ["compute", "etc_hosts", "nfs", "basic_users", "eessi"] } # Example of how to add another partition: # extra: { diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf index ba9da127c..a90108924 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf @@ -20,6 +20,8 @@ module "compute" { root_volume_size = lookup(each.value, "root_volume_size", var.root_volume_size) extra_volumes = lookup(each.value, "extra_volumes", {}) + compute_init_enable = lookup(each.value, "compute_init_enable", []) + key_pair = var.key_pair environment_root = var.environment_root k3s_token = var.k3s_token diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf index ab869e28e..9bb75466e 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf @@ -74,11 +74,14 @@ resource "openstack_compute_instance_v2" "compute" { access_network = true } - metadata = { - environment_root = var.environment_root - k3s_token = var.k3s_token - control_address = var.control_address - } + metadata = merge( + { + environment_root = var.environment_root + k3s_token = var.k3s_token + control_address = var.control_address + }, + {for e in var.compute_init_enable: e => true} + ) user_data = <<-EOF #cloud-config diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/variables.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/variables.tf index 72bcf08fd..b0e489017 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/variables.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/variables.tf @@ -88,3 +88,9 @@ variable "control_address" { description = "Name/address of control node" type = string } + +variable "compute_init_enable" { + type = list(string) + description = "Groups to activate for ansible-init compute rebuilds" + default = [] +} \ No newline at end of file diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf index 4d8058208..bdffd40ce 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf @@ -52,6 +52,7 @@ variable "compute" { image_id: Overrides variable cluster_image_id vnic_type: Overrides variable vnic_type vnic_profile: Overrides variable vnic_profile + compute_init_enable: Toggles compute-init rebuild (see compute-init role docs) volume_backed_instances: Overrides variable volume_backed_instances root_volume_size: Overrides variable root_volume_size extra_volumes: Mapping defining additional volumes to create and attach @@ -142,4 +143,4 @@ variable "root_volume_size" { variable "k3s_token" { description = "K3s cluster authentication token, set automatically by Ansible" type = string -} +} \ No newline at end of file