From 875f61e378c456283da015a5264a4f9c5aad2353 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 24 Jan 2024 11:57:50 +0000 Subject: [PATCH 01/29] add changes from branch rl9 --- ansible/roles/mysql/tasks/install.yml | 8 +++++++- ansible/roles/mysql/templates/mysql.service.j2 | 2 +- .../roles/opensearch/templates/opensearch.service.j2 | 2 +- environments/.stackhpc/hooks/post-bootstrap.yml | 11 ++++++----- environments/.stackhpc/terraform/main.tf | 4 ++-- requirements.yml | 2 +- 6 files changed, 18 insertions(+), 11 deletions(-) diff --git a/ansible/roles/mysql/tasks/install.yml b/ansible/roles/mysql/tasks/install.yml index 4427b7d18..7fc43d292 100644 --- a/ansible/roles/mysql/tasks/install.yml +++ b/ansible/roles/mysql/tasks/install.yml @@ -1,6 +1,12 @@ +- name: Install pip + dnf: + name: python3-pip + - name: Install python mysql client pip: - name: pymysql + name: + - pymysql + - cryptography state: present - name: Create systemd mysql container unit file diff --git a/ansible/roles/mysql/templates/mysql.service.j2 b/ansible/roles/mysql/templates/mysql.service.j2 index 3b531cd3f..794035a8e 100644 --- a/ansible/roles/mysql/templates/mysql.service.j2 +++ b/ansible/roles/mysql/templates/mysql.service.j2 @@ -26,7 +26,7 @@ ExecStart=/usr/bin/podman run \ --volume {{ mysql_datadir }}:/var/lib/mysql:U \ --publish 3306:3306 \ --env MYSQL_ROOT_PASSWORD=${MYSQL_INITIAL_ROOT_PASSWORD} \ - mysql:{{ mysql_tag }}{%- for opt in mysql_mysqld_options %} \ + docker.io/library/mysql:{{ mysql_tag }}{%- for opt in mysql_mysqld_options %} \ --{{ opt }}{% endfor %} ExecStop=/usr/bin/podman stop --ignore mysql -t 10 diff --git a/ansible/roles/opensearch/templates/opensearch.service.j2 b/ansible/roles/opensearch/templates/opensearch.service.j2 index 6951bafc0..2d98305eb 100644 --- a/ansible/roles/opensearch/templates/opensearch.service.j2 +++ b/ansible/roles/opensearch/templates/opensearch.service.j2 @@ -29,7 +29,7 @@ ExecStart=/usr/bin/podman run \ --env bootstrap.memory_lock=true \ 
--env "OPENSEARCH_JAVA_OPTS=-Xms512m -Xmx512m" \ --env DISABLE_INSTALL_DEMO_CONFIG=true \ - opensearchproject/opensearch:{{ opensearch_version }} + docker.io/opensearchproject/opensearch:{{ opensearch_version }} ExecStop=/usr/bin/podman stop --ignore opensearch -t 10 # note for some reason this returns status=143 which makes systemd show the unit as failed, not stopped ExecStopPost=/usr/bin/podman rm --ignore -f opensearch diff --git a/environments/.stackhpc/hooks/post-bootstrap.yml b/environments/.stackhpc/hooks/post-bootstrap.yml index fe783e469..df3902698 100644 --- a/environments/.stackhpc/hooks/post-bootstrap.yml +++ b/environments/.stackhpc/hooks/post-bootstrap.yml @@ -3,14 +3,15 @@ gather_facts: false tags: podman tasks: - - name: Configure container image registry for unqualified searches to avoid docker.io ratelimits + - name: Configure container image registry to avoid docker.io ratelimits copy: - dest: /etc/containers/registries.conf.d/003-arcus-unqualfied-overrides.conf + dest: /etc/containers/registries.conf.d/003-arcus-mirror.conf content: | - unqualified-search-registries = ['{{ podman_registry_address | split('/') | first }}', 'registry.access.redhat.com', 'registry.redhat.io', 'docker.io'] - [[registry]] - prefix = "{{ podman_registry_address }}" + location="docker.io/library/" + prefix="docker.io/library/" + + [[registry.mirror]] location = "{{ podman_registry_address }}" insecure = true when: "ci_cloud == 'ARCUS'" diff --git a/environments/.stackhpc/terraform/main.tf b/environments/.stackhpc/terraform/main.tf index 0ab3be5ee..3f6643598 100644 --- a/environments/.stackhpc/terraform/main.tf +++ b/environments/.stackhpc/terraform/main.tf @@ -13,8 +13,8 @@ variable "cluster_name" { variable "cluster_image" { description = "single image for all cluster nodes - a convenience for CI" type = string - default = "openhpc-240116-1156-aa8dba7d" # https://github.com/stackhpc/ansible-slurm-appliance/pull/351 - # default = 
"Rocky-8-GenericCloud-Base-8.9-20231119.0.x86_64.qcow2" + # default = "openhpc-240116-1156-aa8dba7d" # https://github.com/stackhpc/ansible-slurm-appliance/pull/351 + default = "Rocky-9-GenericCloud-Base-9.3-20231113.0.x86_64.qcow2" } variable "cluster_net" {} diff --git a/requirements.yml b/requirements.yml index 3587966aa..bc5451220 100644 --- a/requirements.yml +++ b/requirements.yml @@ -3,7 +3,7 @@ roles: - src: stackhpc.nfs version: v23.12.1 # Tolerate state nfs file handles - src: https://github.com/stackhpc/ansible-role-openhpc.git - version: v0.23.0 # https://github.com/stackhpc/ansible-role-openhpc/pull/165 + version: rl9 name: stackhpc.openhpc - src: https://github.com/stackhpc/ansible-node-exporter.git version: stackhpc From 458cc0ad1c3e0ea032946691b3287d45d37696bc Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 24 Jan 2024 14:34:25 +0000 Subject: [PATCH 02/29] fix unqualified names for container pulls --- ansible/roles/mysql/tasks/install.yml | 2 +- ansible/roles/opensearch/tasks/install.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ansible/roles/mysql/tasks/install.yml b/ansible/roles/mysql/tasks/install.yml index 7fc43d292..4ed5d30ba 100644 --- a/ansible/roles/mysql/tasks/install.yml +++ b/ansible/roles/mysql/tasks/install.yml @@ -17,6 +17,6 @@ - name: Pull container image containers.podman.podman_image: - name: "mysql" + name: docker.io/library/mysql tag: "{{ mysql_tag }}" become_user: "{{ mysql_podman_user }}" diff --git a/ansible/roles/opensearch/tasks/install.yml b/ansible/roles/opensearch/tasks/install.yml index 81547e5a0..9a0ffd361 100644 --- a/ansible/roles/opensearch/tasks/install.yml +++ b/ansible/roles/opensearch/tasks/install.yml @@ -16,7 +16,7 @@ - name: Pull container image containers.podman.podman_image: - name: "opensearchproject/opensearch" + name: docker.io/opensearchproject/opensearch tag: "{{ opensearch_version }}" become_user: "{{ opensearch_podman_user }}" From 
64071cfb5f6e18f477aaed96931d5e487e7b96c1 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 24 Jan 2024 15:30:48 +0000 Subject: [PATCH 03/29] fix openondemand install --- ansible/roles/openondemand/tasks/main.yml | 2 +- requirements.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ansible/roles/openondemand/tasks/main.yml b/ansible/roles/openondemand/tasks/main.yml index 34e1ac223..86184f13c 100644 --- a/ansible/roles/openondemand/tasks/main.yml +++ b/ansible/roles/openondemand/tasks/main.yml @@ -10,7 +10,7 @@ - include_role: name: osc.ood tasks_from: install-package.yml - vars_from: Rocky/8.yml + vars_from: "Rocky/{{ ansible_distribution_major_version }}.yml" public: yes # Expose the vars from this role to the rest of the play # can't set vars: from a dict hence the workaround above diff --git a/requirements.yml b/requirements.yml index bc5451220..c675003fa 100644 --- a/requirements.yml +++ b/requirements.yml @@ -3,7 +3,7 @@ roles: - src: stackhpc.nfs version: v23.12.1 # Tolerate state nfs file handles - src: https://github.com/stackhpc/ansible-role-openhpc.git - version: rl9 + version: rl9 # TODO: FIXME: doesn't work on RL8 at the moment name: stackhpc.openhpc - src: https://github.com/stackhpc/ansible-node-exporter.git version: stackhpc From 5ffbf33facb0bc51ad9a000a4c47bec8f40afdbf Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 24 Jan 2024 15:33:01 +0000 Subject: [PATCH 04/29] bugfix filebeat unit reload --- ansible/roles/filebeat/tasks/install.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/ansible/roles/filebeat/tasks/install.yml b/ansible/roles/filebeat/tasks/install.yml index 8e64722ec..6514e3028 100644 --- a/ansible/roles/filebeat/tasks/install.yml +++ b/ansible/roles/filebeat/tasks/install.yml @@ -15,3 +15,4 @@ - name: Reload filebeat unit file command: systemctl daemon-reload when: _filebeat_unit.changed + become: true From 0982f41fbf486f354901617e571a874853daab68 Mon Sep 17 00:00:00 2001 From: Steve Brasier 
Date: Wed, 24 Jan 2024 15:35:40 +0000 Subject: [PATCH 05/29] comment on required image --- environments/.stackhpc/terraform/main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environments/.stackhpc/terraform/main.tf b/environments/.stackhpc/terraform/main.tf index 3f6643598..ebaebaa2d 100644 --- a/environments/.stackhpc/terraform/main.tf +++ b/environments/.stackhpc/terraform/main.tf @@ -14,7 +14,7 @@ variable "cluster_image" { description = "single image for all cluster nodes - a convenience for CI" type = string # default = "openhpc-240116-1156-aa8dba7d" # https://github.com/stackhpc/ansible-slurm-appliance/pull/351 - default = "Rocky-9-GenericCloud-Base-9.3-20231113.0.x86_64.qcow2" + default = "Rocky-9-GenericCloud-Base-9.3-20231113.0.x86_64.qcow2" # TODO: create packer build } variable "cluster_net" {} From 37e799a2e454cc745d54e129063535c18fe8409f Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 24 Jan 2024 16:37:21 +0000 Subject: [PATCH 06/29] bump fat image base to RL9.3 --- environments/.stackhpc/ARCUS.pkrvars.hcl | 2 +- environments/.stackhpc/SMS.pkrvars.hcl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/.stackhpc/ARCUS.pkrvars.hcl b/environments/.stackhpc/ARCUS.pkrvars.hcl index 2b1bbfb39..73d6d8c99 100644 --- a/environments/.stackhpc/ARCUS.pkrvars.hcl +++ b/environments/.stackhpc/ARCUS.pkrvars.hcl @@ -4,7 +4,7 @@ volume_size = 12 # GB. 
Compatible with SMS-lab's general.v1.tiny image_disk_format = "qcow2" networks = ["4b6b2722-ee5b-40ec-8e52-a6610e14cc51"] # portal-internal (DNS broken on ilab-60) source_image_name = "openhpc-230804-1754-80b8d714" # https://github.com/stackhpc/ansible-slurm-appliance/pull/298 -fatimage_source_image_name = "Rocky-8-GenericCloud-Base-8.9-20231119.0.x86_64.qcow2" +fatimage_source_image_name = "Rocky-9-GenericCloud-Base-9.3-20231113.0.x86_64.qcow2" ssh_keypair_name = "slurm-app-ci" ssh_private_key_file = "~/.ssh/id_rsa" security_groups = ["default", "SSH"] diff --git a/environments/.stackhpc/SMS.pkrvars.hcl b/environments/.stackhpc/SMS.pkrvars.hcl index cd9fe589a..f3678b23f 100644 --- a/environments/.stackhpc/SMS.pkrvars.hcl +++ b/environments/.stackhpc/SMS.pkrvars.hcl @@ -1,7 +1,7 @@ flavor = "general.v1.tiny" networks = ["26023e3d-bc8e-459c-8def-dbd47ab01756"] # stackhpc-ipv4-geneve source_image_name = "openhpc-230503-0944-bf8c3f63" # https://github.com/stackhpc/ansible-slurm-appliance/pull/252 -fatimage_source_image_name = "Rocky-8-GenericCloud-Base-8.9-20231119.0.x86_64.qcow2" +fatimage_source_image_name = "Rocky-9-GenericCloud-Base-9.3-20231113.0.x86_64.qcow2" ssh_keypair_name = "slurm-app-ci" ssh_private_key_file = "~/.ssh/id_rsa" security_groups = ["default", "SSH"] From 9d5688e1bb9282f22ca1b42312885a87b0345c85 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 24 Jan 2024 16:57:37 +0000 Subject: [PATCH 07/29] fix dbus-launch command for OOD desktop --- ansible/roles/openondemand/tasks/vnc_compute.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/ansible/roles/openondemand/tasks/vnc_compute.yml b/ansible/roles/openondemand/tasks/vnc_compute.yml index bde13c383..5f403bf86 100644 --- a/ansible/roles/openondemand/tasks/vnc_compute.yml +++ b/ansible/roles/openondemand/tasks/vnc_compute.yml @@ -17,6 +17,7 @@ - turbovnc-3.0.1 - nmap-ncat - python3.9 + - dbus-x11 - name: Install Xfce desktop tags: install From 6e8ea6c4d680e30b40f6fc87591891675e71c13e Mon Sep 
17 00:00:00 2001 From: Steve Brasier Date: Wed, 24 Jan 2024 17:15:34 +0000 Subject: [PATCH 08/29] fix OOD desktop launch --- environments/common/inventory/group_vars/all/openondemand.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/environments/common/inventory/group_vars/all/openondemand.yml b/environments/common/inventory/group_vars/all/openondemand.yml index b7bdfdabc..18e741ce7 100644 --- a/environments/common/inventory/group_vars/all/openondemand.yml +++ b/environments/common/inventory/group_vars/all/openondemand.yml @@ -49,6 +49,8 @@ openondemand_clusters: module purge export PATH=/opt/TurboVNC/bin:$PATH + # avoid "Failed to create secure directory (/run/user/*/pulse)" + export XDG_RUNTIME_DIR="$TMPDIR/xdg_runtime" # Workaround to avoid "Unable to contact settings server" when # lauching xfce4-session From e5608d9e9646566e96630d275bbebf0a19d87eb8 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 25 Jan 2024 10:53:26 +0000 Subject: [PATCH 09/29] fix useradd warning: {grafana,prometheus}'s uid * outside of the UID_MIN 1000 and UID_MAX 60000 range. --- environments/common/inventory/group_vars/all/defaults.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/environments/common/inventory/group_vars/all/defaults.yml b/environments/common/inventory/group_vars/all/defaults.yml index 23448c80d..91db4dc3a 100644 --- a/environments/common/inventory/group_vars/all/defaults.yml +++ b/environments/common/inventory/group_vars/all/defaults.yml @@ -58,6 +58,7 @@ appliances_local_users_default: uid: 981 home: "{{ prometheus_db_dir }}" shell: /usr/sbin/nologin + system: true enable: "{{ 'prometheus' in group_names }}" - group: @@ -69,6 +70,7 @@ appliances_local_users_default: uid: 984 home: /usr/share/grafana shell: /sbin/nologin + system: true enable: "{{ 'grafana' in group_names }}" # Overide this to add extra users whilst keeping the defaults. 
From f439ee430a9af52309e9f182b35377e75f19a2a0 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 1 Feb 2024 14:20:11 +0000 Subject: [PATCH 10/29] fix manila support in RL9 --- requirements.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.yml b/requirements.yml index bfc087fff..c38ace88a 100644 --- a/requirements.yml +++ b/requirements.yml @@ -22,7 +22,7 @@ roles: version: v3.0.6 - src: https://github.com/stackhpc/ansible-role-os-manila-mount.git name: stackhpc.os-manila-mount - version: v24.1.0 + version: feat/RL9 # TODO: bump to release collections: - name: containers.podman From 29a0d2a45c57e3d02f66cbd6a16fa531b187e7e4 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 21 Feb 2024 16:20:04 +0000 Subject: [PATCH 11/29] bump openhpc role after merge --- requirements.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.yml b/requirements.yml index c38ace88a..399ff8ffa 100644 --- a/requirements.yml +++ b/requirements.yml @@ -3,7 +3,7 @@ roles: - src: stackhpc.nfs version: v23.12.1 # Tolerate state nfs file handles - src: https://github.com/stackhpc/ansible-role-openhpc.git - version: rl9 # TODO: FIXME: doesn't work on RL8 at the moment + version: 7f547d0 # TODO: bump on release name: stackhpc.openhpc - src: https://github.com/stackhpc/ansible-node-exporter.git version: stackhpc From 048e96cf08938a20c29c299f7c1a801e71ef1c50 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 23 Feb 2024 10:15:43 +0000 Subject: [PATCH 12/29] downsize arcus control node to try to avoid CI failures --- environments/.stackhpc/terraform/ARCUS.tfvars | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environments/.stackhpc/terraform/ARCUS.tfvars b/environments/.stackhpc/terraform/ARCUS.tfvars index 8ebf93478..d8c2a7350 100644 --- a/environments/.stackhpc/terraform/ARCUS.tfvars +++ b/environments/.stackhpc/terraform/ARCUS.tfvars @@ -1,5 +1,5 @@ cluster_net = "portal-internal" cluster_subnet = 
"portal-internal" vnic_type = "normal" -control_node_flavor = "vm.ska.cpu.general.quarter" +control_node_flavor = "vm.ska.cpu.general.eighth" other_node_flavor = "vm.ska.cpu.general.small" From 61302c52a06bd4e789a1c0168bd4fb19e7a6f47d Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 23 Feb 2024 15:56:15 +0000 Subject: [PATCH 13/29] prevent ssh hanging after NFS server reimaged in CI --- ansible/bootstrap.yml | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/ansible/bootstrap.yml b/ansible/bootstrap.yml index 9b6fda0de..1034fff50 100644 --- a/ansible/bootstrap.yml +++ b/ansible/bootstrap.yml @@ -40,7 +40,12 @@ - hosts: cluster gather_facts: false tasks: - - name: Add groups + - name: Prevent ssh hanging if shared home is unavailable + lineinfile: + path: /etc/profile + search_string: HOSTNAME=$(/usr/bin/hostnamectl --transient 2>/dev/null) || \ + state: absent + - name: Add system user groups ansible.builtin.group: "{{ item.group }}" loop: "{{ appliances_local_users }}" when: @@ -50,7 +55,7 @@ # Need to change working directory otherwise we try to switch back to non-existent directory. 
become_flags: '-i' become: true - - name: Add users + - name: Add system users ansible.builtin.user: "{{ item.user }}" loop: "{{ appliances_local_users }}" when: item.enable | default(true) | bool From 880364a244ecdf2f9913864a69465ee1fa9db2c3 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 5 Mar 2024 11:47:18 +0000 Subject: [PATCH 14/29] fix 'prevent ssh hanging after NFS server reimaged in CI' --- ansible/bootstrap.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/ansible/bootstrap.yml b/ansible/bootstrap.yml index 1034fff50..ea56d1c9b 100644 --- a/ansible/bootstrap.yml +++ b/ansible/bootstrap.yml @@ -45,6 +45,7 @@ path: /etc/profile search_string: HOSTNAME=$(/usr/bin/hostnamectl --transient 2>/dev/null) || \ state: absent + become: yes - name: Add system user groups ansible.builtin.group: "{{ item.group }}" loop: "{{ appliances_local_users }}" From 6898fbf627d0151272127dc22dcc9e4810604bed Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 7 Mar 2024 13:58:35 +0000 Subject: [PATCH 15/29] fix import of osc GPG key for RL9 --- ansible/fatimage.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ansible/fatimage.yml b/ansible/fatimage.yml index dbbfe815b..88329225f 100644 --- a/ansible/fatimage.yml +++ b/ansible/fatimage.yml @@ -56,7 +56,7 @@ - hosts: builder become: yes - gather_facts: no + gather_facts: yes tasks: # - import_playbook: slurm.yml: - name: Setup DB @@ -69,7 +69,7 @@ tasks_from: install.yml - name: Include distribution variables for osc.ood - include_vars: "{{ appliances_repository_root }}/ansible/roles/osc.ood/vars/Rocky/8.yml" + include_vars: "{{ appliances_repository_root }}/ansible/roles/osc.ood/vars/Rocky/{{ ansible_distribution_major_version }}.yml" # FUTURE: install-apps.yml - this is git clones # - import_playbook: portal.yml From fa4026da53f8776c0cac7b5e1f061b1333193b3b Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 7 Mar 2024 17:07:01 +0000 Subject: [PATCH 16/29] make fatimage more like 
openondemand role --- ansible/fatimage.yml | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/ansible/fatimage.yml b/ansible/fatimage.yml index 88329225f..a0b7139ee 100644 --- a/ansible/fatimage.yml +++ b/ansible/fatimage.yml @@ -68,15 +68,13 @@ name: stackhpc.openhpc tasks_from: install.yml - - name: Include distribution variables for osc.ood - include_vars: "{{ appliances_repository_root }}/ansible/roles/osc.ood/vars/Rocky/{{ ansible_distribution_major_version }}.yml" - # FUTURE: install-apps.yml - this is git clones - # - import_playbook: portal.yml - name: Open Ondemand server - import_role: + include_role: name: osc.ood tasks_from: install-package.yml + vars_from: "Rocky/{{ ansible_distribution_major_version }}.yml" + # # FUTURE: install-apps.yml - this is git clones - name: Open Ondemand remote desktop import_role: name: openondemand From 21e06a10db3b82bcf85704eb4ebf1ede899bcf4f Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 8 Mar 2024 13:20:56 +0000 Subject: [PATCH 17/29] enable fatimage build on either RL8 or RL9, inc on PRs --- .github/workflows/fatimage.yml | 20 ++++++- environments/.stackhpc/ARCUS.pkrvars.hcl | 4 +- environments/.stackhpc/LEAFCLOUD.pkrvars.hcl | 2 - packer/openstack.pkr.hcl | 57 ++++++-------------- 4 files changed, 35 insertions(+), 48 deletions(-) diff --git a/.github/workflows/fatimage.yml b/.github/workflows/fatimage.yml index 6c33c6ee9..8771f3dd9 100644 --- a/.github/workflows/fatimage.yml +++ b/.github/workflows/fatimage.yml @@ -1,12 +1,26 @@ name: Build fat image -on: +'on': workflow_dispatch: + inputs: + use_RL9: + required: true + description: Include RL9 image build + type: boolean + default: false jobs: openstack: name: openstack-imagebuild - concurrency: ${{ github.ref }} # to branch/PR + concurrency: ${{ github.ref }}-{{ matrix.os_version }} # to branch/PR + OS runs-on: ubuntu-20.04 + strategy: + matrix: + os_version: [RL8, RL9] + rl9_selected: + - ${{ inputs.use_RL9 == true }} # only 
potentially true for workflow_dispatch + exclude: + - os_version: RL9 + rl9_selected: false env: ANSIBLE_FORCE_COLOR: True OS_CLOUD: openstack @@ -48,6 +62,8 @@ jobs: cd packer/ packer init . PACKER_LOG=1 packer build -only openstack.openhpc -on-error=${{ vars.PACKER_ON_ERROR }} -var-file=$PKR_VAR_environment_root/${{ vars.CI_CLOUD }}.pkrvars.hcl openstack.pkr.hcl + env: + PKR_VAR_os_version: ${{ matrix.os_version }} - name: Get created image name from manifest id: manifest diff --git a/environments/.stackhpc/ARCUS.pkrvars.hcl b/environments/.stackhpc/ARCUS.pkrvars.hcl index 73d6d8c99..738a021c0 100644 --- a/environments/.stackhpc/ARCUS.pkrvars.hcl +++ b/environments/.stackhpc/ARCUS.pkrvars.hcl @@ -1,10 +1,8 @@ flavor = "vm.ska.cpu.general.small" use_blockstorage_volume = true -volume_size = 12 # GB. Compatible with SMS-lab's general.v1.tiny +volume_size = 12 # GB. Compatible with SMS-lab's general.v1.tiny image_disk_format = "qcow2" networks = ["4b6b2722-ee5b-40ec-8e52-a6610e14cc51"] # portal-internal (DNS broken on ilab-60) -source_image_name = "openhpc-230804-1754-80b8d714" # https://github.com/stackhpc/ansible-slurm-appliance/pull/298 -fatimage_source_image_name = "Rocky-9-GenericCloud-Base-9.3-20231113.0.x86_64.qcow2" ssh_keypair_name = "slurm-app-ci" ssh_private_key_file = "~/.ssh/id_rsa" security_groups = ["default", "SSH"] diff --git a/environments/.stackhpc/LEAFCLOUD.pkrvars.hcl b/environments/.stackhpc/LEAFCLOUD.pkrvars.hcl index bb778b0ae..1e2b6d3e2 100644 --- a/environments/.stackhpc/LEAFCLOUD.pkrvars.hcl +++ b/environments/.stackhpc/LEAFCLOUD.pkrvars.hcl @@ -4,8 +4,6 @@ volume_size = 12 # GB. 
Compatible with SMS-lab's general.v1.tiny volume_type = "unencrypted" image_disk_format = "qcow2" networks = ["909e49e8-6911-473a-bf88-0495ca63853c"] # slurmapp-ci -source_image_name = "openhpc-230804-1754-80b8d714" # https://github.com/stackhpc/ansible-slurm-appliance/pull/298 -fatimage_source_image_name = "Rocky-8-GenericCloud-Base-8.9-20231119.0.x86_64.qcow2" ssh_keypair_name = "slurm-app-ci" ssh_private_key_file = "~/.ssh/id_rsa" security_groups = ["default", "SSH"] diff --git a/packer/openstack.pkr.hcl b/packer/openstack.pkr.hcl index d3a0283d7..3bc44d609 100644 --- a/packer/openstack.pkr.hcl +++ b/packer/openstack.pkr.hcl @@ -39,26 +39,26 @@ variable "networks" { type = list(string) } -# Must supply either source_image_name or source_image -variable "source_image_name" { +# Must supply either fatimage_source_image_name or fatimage_source_image +variable "os_version" { type = string - default = null + description = "RL8 or RL9" } -variable "source_image" { - type = string - default = null -} - -# Must supply either fatimage_source_image_name or fatimage_source_image variable "fatimage_source_image_name" { - type = string - default = null + type = map(string) + default = { + RL8: "Rocky-8-GenericCloud-Base-8.9-20231119.0.x86_64.qcow2" + RL9: "Rocky-9-GenericCloud-Base-9.3-20231113.0.x86_64.qcow2" + } } variable "fatimage_source_image" { - type = string - default = null + type = map(string) + default = { + RL8: null + RL9: null + } } variable "flavor" { @@ -159,38 +159,13 @@ source "openstack" "openhpc" { image_visibility = "${var.image_visibility}" } -# NB: build names, split on "-", are used to determine groups to add build to, so could build for a compute gpu group using e.g. `compute-gpu`. 
-build { - source "source.openstack.openhpc" { - name = "compute" - source_image = "${var.source_image}" - source_image_name = "${var.source_image_name}" # NB: must already exist in OpenStack - image_name = "ohpc-${source.name}-${local.timestamp}-${substr(local.git_commit, 0, 8)}" # also provides a unique legal instance hostname (in case of parallel packer builds) - } - - provisioner "ansible" { - playbook_file = "${var.repo_root}/ansible/site.yml" - groups = concat(["builder"], split("-", "${source.name}")) - keep_inventory_file = true # for debugging - use_proxy = false # see https://www.packer.io/docs/provisioners/ansible#troubleshooting - extra_arguments = ["--limit", "builder", "-i", "${var.repo_root}/packer/ansible-inventory.sh", "-vv", "-e", "@${var.repo_root}/packer/${source.name}_extravars.yml"] - } - - post-processor "manifest" { - output = "${var.manifest_output_path}" - custom_data = { - source = "${source.name}" - } - } -} - # The "fat" image build with all binaries: build { source "source.openstack.openhpc" { floating_ip_network = "${var.floating_ip_network}" - source_image = "${var.fatimage_source_image}" - source_image_name = "${var.fatimage_source_image_name}" # NB: must already exist in OpenStack - image_name = "${source.name}-${local.timestamp}-${substr(local.git_commit, 0, 8)}" # similar to name from slurm_image_builder + source_image = "${var.fatimage_source_image[var.os_version]}" + source_image_name = "${var.fatimage_source_image_name[var.os_version]}" # NB: must already exist in OpenStack + image_name = "${source.name}-${var.os_version}-${local.timestamp}-${substr(local.git_commit, 0, 8)}" # similar to name from slurm_image_builder } provisioner "ansible" { From 411c59fd742dd69e34ecdb05c347bfd69cb36f50 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 8 Mar 2024 14:08:04 +0000 Subject: [PATCH 18/29] get concurrent OS builds --- .github/workflows/fatimage.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/.github/workflows/fatimage.yml b/.github/workflows/fatimage.yml index 8771f3dd9..7dbd95d20 100644 --- a/.github/workflows/fatimage.yml +++ b/.github/workflows/fatimage.yml @@ -11,8 +11,8 @@ name: Build fat image jobs: openstack: name: openstack-imagebuild - concurrency: ${{ github.ref }}-{{ matrix.os_version }} # to branch/PR + OS runs-on: ubuntu-20.04 + concurrency: ${{ github.ref }}-{{ matrix.os_version }} # to branch/PR + OS strategy: matrix: os_version: [RL8, RL9] From c34bff176c1d831707a4f6faa7d78ec200bd3dd1 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 8 Mar 2024 15:43:44 +0000 Subject: [PATCH 19/29] enable RL8 and RL9 for CI test workflow --- .github/workflows/stackhpc.yml | 28 +++++++++++++++++++++--- environments/.stackhpc/hooks/pre.yml | 2 ++ environments/.stackhpc/terraform/main.tf | 14 +++++++++--- packer/openstack.pkr.hcl | 2 +- 4 files changed, 39 insertions(+), 7 deletions(-) diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml index 3f944063c..002a28d0c 100644 --- a/.github/workflows/stackhpc.yml +++ b/.github/workflows/stackhpc.yml @@ -2,6 +2,12 @@ name: Test deployment and reimage on OpenStack on: workflow_dispatch: + inputs: + use_RL9: + required: true + description: Include RL9 tests + type: boolean + default: false push: branches: - main @@ -9,12 +15,22 @@ on: jobs: openstack: name: openstack-ci - concurrency: ${{ github.ref }} # to branch/PR - runs-on: ubuntu-20.04 + concurrency: ${{ github.ref }}-{{ matrix.os_version }} # to branch/PR + OS + strategy: + matrix: + os_version: [RL8, RL9] + rl9_selected: + - ${{ inputs.use_RL9 == true }} # only potentially true for workflow_dispatch + rl9_branch: + - ${{ startsWith(github.head_ref, 'rl9') == true }} # only potentially for pull_request, always false on merge + exclude: + - os_version: RL9 + rl9_selected: false + rl9_branch: false env: ANSIBLE_FORCE_COLOR: True OS_CLOUD: openstack - TF_VAR_cluster_name: ci${{ github.run_id }} + TF_VAR_cluster_name: slurmci-${{ 
matrix.os_version }}-${{ github.run_id }} CI_CLOUD: ${{ vars.CI_CLOUD }} steps: - uses: actions/checkout@v2 @@ -69,6 +85,8 @@ jobs: . environments/.stackhpc/activate cd $APPLIANCES_ENVIRONMENT_ROOT/terraform terraform apply -auto-approve -var-file="${{ vars.CI_CLOUD }}.tfvars" + env: + TF_VAR_os_version: ${{ matrix.os_version }} - name: Delete infrastructure if provisioning failed run: | @@ -77,6 +95,8 @@ jobs: cd $APPLIANCES_ENVIRONMENT_ROOT/terraform terraform destroy -auto-approve -var-file="${{ vars.CI_CLOUD }}.tfvars" if: failure() && steps.provision_servers.outcome == 'failure' + env: + TF_VAR_os_version: ${{ matrix.os_version }} - name: Configure cluster run: | @@ -175,6 +195,8 @@ jobs: cd $APPLIANCES_ENVIRONMENT_ROOT/terraform terraform destroy -auto-approve -var-file="${{ vars.CI_CLOUD }}.tfvars" if: ${{ success() || cancelled() }} + env: + TF_VAR_os_version: ${{ matrix.os_version }} # - name: Delete images # run: | diff --git a/environments/.stackhpc/hooks/pre.yml b/environments/.stackhpc/hooks/pre.yml index 65dfad72d..d783441f7 100644 --- a/environments/.stackhpc/hooks/pre.yml +++ b/environments/.stackhpc/hooks/pre.yml @@ -2,6 +2,8 @@ become: yes gather_facts: false tasks: + - name: Output OS version + command: /etc/redhat-release - name: Write CI-generated inventory and secrets for debugging ansible.builtin.copy: dest: /etc/ci-config/ diff --git a/environments/.stackhpc/terraform/main.tf b/environments/.stackhpc/terraform/main.tf index 4e83f0ce5..4f03ed10a 100644 --- a/environments/.stackhpc/terraform/main.tf +++ b/environments/.stackhpc/terraform/main.tf @@ -19,11 +19,19 @@ variable "cluster_name" { description = "Name for cluster, used as prefix for resources - set by environment var in CI" } +variable "os_version" { + type = string + description = "RL8 or RL9" +} + variable "cluster_image" { description = "single image for all cluster nodes - a convenience for CI" - type = string - # default = "openhpc-240307-1635-ff0f9833" # 
https://github.com/stackhpc/ansible-slurm-appliance/pull/376 - default = "Rocky-9-GenericCloud-Base-9.3-20231113.0.x86_64.qcow2" # TODO: create packer build + type = map(string) + default = { + # https://github.com/stackhpc/ansible-slurm-appliance/pull/353 + RL8: "openhpc-RL8-240308-1440-411c59fd" + RL9: "openhpc-RL9-240308-1414-411c59fd" + } } variable "cluster_net" {} diff --git a/packer/openstack.pkr.hcl b/packer/openstack.pkr.hcl index 3bc44d609..7fdc8037e 100644 --- a/packer/openstack.pkr.hcl +++ b/packer/openstack.pkr.hcl @@ -39,12 +39,12 @@ variable "networks" { type = list(string) } -# Must supply either fatimage_source_image_name or fatimage_source_image variable "os_version" { type = string description = "RL8 or RL9" } +# Must supply either fatimage_source_image_name or fatimage_source_image variable "fatimage_source_image_name" { type = map(string) default = { From 14205693664679744df5eafb19a828eb00398b9b Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 8 Mar 2024 15:49:11 +0000 Subject: [PATCH 20/29] fix stackhpc workflow --- .github/workflows/stackhpc.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml index 002a28d0c..bdb85029b 100644 --- a/.github/workflows/stackhpc.yml +++ b/.github/workflows/stackhpc.yml @@ -16,6 +16,7 @@ jobs: openstack: name: openstack-ci concurrency: ${{ github.ref }}-{{ matrix.os_version }} # to branch/PR + OS + runs-on: ubuntu-20.04 strategy: matrix: os_version: [RL8, RL9] From 900e052ff83278496560b3523288742184450698 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 8 Mar 2024 17:03:04 +0000 Subject: [PATCH 21/29] fix CI image selection --- environments/.stackhpc/terraform/main.tf | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/environments/.stackhpc/terraform/main.tf b/environments/.stackhpc/terraform/main.tf index 4f03ed10a..6e0650862 100644 --- a/environments/.stackhpc/terraform/main.tf +++ 
b/environments/.stackhpc/terraform/main.tf @@ -25,7 +25,7 @@ variable "os_version" { } variable "cluster_image" { - description = "single image for all cluster nodes - a convenience for CI" + description = "single image for all cluster nodes, keyed by os_version - a convenience for CI" type = map(string) default = { # https://github.com/stackhpc/ansible-slurm-appliance/pull/353 @@ -68,23 +68,23 @@ module "cluster" { key_pair = "slurm-app-ci" control_node = { flavor: var.control_node_flavor - image: var.cluster_image + image: var.cluster_image[var.os_version] } login_nodes = { login-0: { flavor: var.other_node_flavor - image: var.cluster_image + image: var.cluster_image[var.os_version] } } compute_types = { standard: { # NB: can't call this default! flavor: var.other_node_flavor - image: var.cluster_image + image: var.cluster_image[var.os_version] } # Example of how to add another partition: # extra: { # flavor: var.other_node_flavor - # image: var.cluster_image + # image: var.cluster_image[var.os_version] # } } compute_nodes = { From 93acdd6cc2be23e48b1ef82091513e86d29d8855 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 8 Mar 2024 17:18:02 +0000 Subject: [PATCH 22/29] fix os debug output --- environments/.stackhpc/hooks/pre.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environments/.stackhpc/hooks/pre.yml b/environments/.stackhpc/hooks/pre.yml index d783441f7..8e6f0e25e 100644 --- a/environments/.stackhpc/hooks/pre.yml +++ b/environments/.stackhpc/hooks/pre.yml @@ -3,7 +3,7 @@ gather_facts: false tasks: - name: Output OS version - command: /etc/redhat-release + command: cat /etc/redhat-release - name: Write CI-generated inventory and secrets for debugging ansible.builtin.copy: dest: /etc/ci-config/ From 15f9ab38f9c3721494e49ca37c594fef085ce2dc Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 13 Mar 2024 10:26:31 +0000 Subject: [PATCH 23/29] fix podman systemd warnings --- ansible/roles/filebeat/templates/filebeat.service.j2 | 
2 +- ansible/roles/mysql/templates/mysql.service.j2 | 2 +- ansible/roles/opensearch/templates/opensearch.service.j2 | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ansible/roles/filebeat/templates/filebeat.service.j2 b/ansible/roles/filebeat/templates/filebeat.service.j2 index 7a3a14277..efdb26827 100644 --- a/ansible/roles/filebeat/templates/filebeat.service.j2 +++ b/ansible/roles/filebeat/templates/filebeat.service.j2 @@ -12,7 +12,7 @@ After=network-online.target [Service] Environment=PODMAN_SYSTEMD_UNIT=%n Restart=always -ExecStart=/usr/bin/podman run \ +ExecStart=/usr/bin/podman --cgroup-manager=cgroupfs run \ --network=host \ --sdnotify=conmon \ --cgroups=no-conmon \ diff --git a/ansible/roles/mysql/templates/mysql.service.j2 b/ansible/roles/mysql/templates/mysql.service.j2 index 794035a8e..657e111e9 100644 --- a/ansible/roles/mysql/templates/mysql.service.j2 +++ b/ansible/roles/mysql/templates/mysql.service.j2 @@ -14,7 +14,7 @@ EnvironmentFile=/etc/sysconfig/mysqld # The above EnvironmentFile must define MYSQL_INITIAL_ROOT_PASSWORD ExecStartPre=+install -d -o {{ mysql_podman_user }} -g {{ mysql_podman_user }} -Z container_file_t {{ mysql_datadir }} ExecStartPre=+chown -R {{ mysql_podman_user }}:{{ mysql_podman_user }} {{ mysql_datadir }} -ExecStart=/usr/bin/podman run \ +ExecStart=/usr/bin/podman --cgroup-manager=cgroupfs run \ --network=host \ --sdnotify=conmon \ --cgroups=no-conmon \ diff --git a/ansible/roles/opensearch/templates/opensearch.service.j2 b/ansible/roles/opensearch/templates/opensearch.service.j2 index 2d98305eb..00dedfc7b 100644 --- a/ansible/roles/opensearch/templates/opensearch.service.j2 +++ b/ansible/roles/opensearch/templates/opensearch.service.j2 @@ -11,7 +11,7 @@ Environment=PODMAN_SYSTEMD_UNIT=%n Restart=always # paths below based on https://opensearch.org/docs/latest/opensearch/configuration/ and https://opensearch.org/docs/latest/security-plugin/configuration/yaml # see also 
https://opensearch.org/docs/2.0/opensearch/install/important-settings/ -ExecStart=/usr/bin/podman run \ +ExecStart=/usr/bin/podman --cgroup-manager=cgroupfs run \ --network=host \ --sdnotify=conmon \ --cgroups=no-conmon \ From 2e1972947c77dd3c5fd5aee6d59728470b768b29 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 13 Mar 2024 13:45:55 +0000 Subject: [PATCH 24/29] bump CI image --- environments/.stackhpc/terraform/main.tf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/.stackhpc/terraform/main.tf b/environments/.stackhpc/terraform/main.tf index 6e0650862..46709463f 100644 --- a/environments/.stackhpc/terraform/main.tf +++ b/environments/.stackhpc/terraform/main.tf @@ -29,8 +29,8 @@ variable "cluster_image" { type = map(string) default = { # https://github.com/stackhpc/ansible-slurm-appliance/pull/353 - RL8: "openhpc-RL8-240308-1440-411c59fd" - RL9: "openhpc-RL9-240308-1414-411c59fd" + RL8: "openhpc-RL8-240313-1028-15f9ab38" + RL9: "openhpc-RL9-240313-1057-15f9ab38" } } From 43d43f20c9e58d678462e6f63720f814fa3474cd Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 13 Mar 2024 15:22:20 +0000 Subject: [PATCH 25/29] bump openhpc role for release --- requirements.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.yml b/requirements.yml index c429fb5ce..3a6dbb7f6 100644 --- a/requirements.yml +++ b/requirements.yml @@ -3,7 +3,7 @@ roles: - src: stackhpc.nfs version: v23.12.1 # Tolerate state nfs file handles - src: https://github.com/stackhpc/ansible-role-openhpc.git - version: 7f547d0 # TODO: bump on release + version: v0.24.0 # https://github.com/stackhpc/ansible-role-openhpc/pull/164 name: stackhpc.openhpc - src: https://github.com/stackhpc/ansible-node-exporter.git version: stackhpc From 25d8547a3a4b612d6b6c9a53681ab4a1320a28dc Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 13 Mar 2024 16:07:38 +0000 Subject: [PATCH 26/29] fix OOD app partitions --- 
.../.stackhpc/inventory/group_vars/openondemand/overrides.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/.stackhpc/inventory/group_vars/openondemand/overrides.yml b/environments/.stackhpc/inventory/group_vars/openondemand/overrides.yml index f810b7ecc..735da25df 100644 --- a/environments/.stackhpc/inventory/group_vars/openondemand/overrides.yml +++ b/environments/.stackhpc/inventory/group_vars/openondemand/overrides.yml @@ -1,6 +1,6 @@ openondemand_auth: basic_pam -openondemand_jupyter_partition: small -openondemand_desktop_partition: small +openondemand_jupyter_partition: standard +openondemand_desktop_partition: standard #openondemand_dashboard_support_url: #openondemand_dashboard_docs_url: #openondemand_filesapp_paths: From c47f0fd44533b5228acba6788cf2428470d016ba Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 14 Mar 2024 09:32:46 +0000 Subject: [PATCH 27/29] delete unused packer vars for compute image builds --- packer/compute_extravars.yml | 3 --- 1 file changed, 3 deletions(-) delete mode 100644 packer/compute_extravars.yml diff --git a/packer/compute_extravars.yml b/packer/compute_extravars.yml deleted file mode 100644 index 2b1fe6963..000000000 --- a/packer/compute_extravars.yml +++ /dev/null @@ -1,3 +0,0 @@ -# Used to override anything defined in a concrete environment -update_enable: false -openhpc_slurm_partitions: [] # as no compute nodes will be in play, but partition definition might exist in inventory From ccb7cf8932b45983d977cd0355041c3ddce78d28 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 14 Mar 2024 09:34:45 +0000 Subject: [PATCH 28/29] update packer README to remove compute image builds --- packer/README.md | 41 +++++++++++++---------------------------- 1 file changed, 13 insertions(+), 28 deletions(-) diff --git a/packer/README.md b/packer/README.md index 1cd14c292..c2a754e5d 100644 --- a/packer/README.md +++ b/packer/README.md @@ -1,27 +1,24 @@ # Packer-based image build -The appliance 
contains code and configuration to use Packer with the [OpenStack builder](https://www.packer.io/plugins/builders/openstack) to build images. Two types of images can be built: +The appliance contains code and configuration to use Packer with the [OpenStack builder](https://www.packer.io/plugins/builders/openstack) to build images. -1. A "fat" image, containing binaries for all nodes, but no configuration. By default, this is done in StackHPC's CI workflow and the image made available to clients. The fat image is intended to be used as the base image for a cluster. This: - - Ensures the cluster is using binaries which have been tested in CI. - - Ensures deployment and further image builds can be completed even if packages are changed in upstream repositories (e.g. due to Rocky Linux or OpenHPC updates). - - Reduces the number of package downloads to improve deployment speed. +The image built is referred to as a "fat" image as it contains binaries for all nodes, but no configuration. Using a "fat" image: +- Enables the image to be tested in CI before production use. +- Ensures re-deployment of the cluster or deployment of additional nodes can be completed even if packages are changed in upstream repositories (e.g. due to RockyLinux or OpenHPC updates). +- Improves deployment speed by reducing the number of package downloads. - This build starts from a RockyLinux GenericCloud image and runs yum update. +A default fat image is built in StackHPC's CI workflow and made available to clients. However, it is possible to build site-specific fat images if required. -2. An environment-specific compute node image, which additionally contains all configuration etc. to allow an instance booted with such an image to join a cluster. This allows Slurm to be used to reimage compute nodes for upgrades, see [stackhpc.slurm_openstack_tools.rebuild/README.md](../ansible/collections/ansible_collections/stackhpc/slurm_openstack_tools/roles/rebuild/README.md).
This build starts from a "fat" image and does not run yum update. +A fat image build starts from a RockyLinux GenericCloud image and (by default) updates all dnf packages in that image. # Build Process - -Building an environment-specific compute node image will[^1] require a cluster to be provisioned to complete the Ansible host/group variables in inventory for the environment. - - Ensure the current OpenStack credentials have sufficient authorisation to upload images (this may or may not require the `member` role for an application credential, depending on your OpenStack configuration). - Create a file `environments//builder.pkrvars.hcl` containing at a minimum e.g.: ```hcl flavor = "general.v1.small" # VM flavor to use for builder VMs networks = ["26023e3d-bc8e-459c-8def-dbd47ab01756"] # List of network UUIDs to attach the VM to - source_image_name = "Rocky-8.5-GenericCloud" # Name of source image. This must exist in OpenStack and should be a Rocky Linux 8.5 GenericCloud-based image. + source_image_name = "Rocky-8.9-GenericCloud" # Name of source image. This must exist in OpenStack and should be a Rocky Linux GenericCloud-based image. ``` This configuration will generate and use an ephemeral SSH key for communicating with the Packer VM. If this is undesirable, set `ssh_keypair_name` to the name of an existing keypair in OpenStack. The private key must be on the host running Packer, and its path can be set using `ssh_private_key_file`. @@ -31,32 +28,20 @@ Building an environment-specific compute node image will[^1] require a cluster t For additional options such as non-default private key locations or jumphost configuration see the variable descriptions in `./openstack.pkr.hcl`. - Activate the venv and the relevant environment. 
-- Ensure you have generated passwords using: - - ansible-playbook ansible/adhoc/generate-passwords.yml -- Ensure you have the private part of the keypair `ssh_keypair_name` at `~/.ssh/id_rsa.pub` (or set variable `ssh_private_key_file` in `builder.pkrvars.hcl`). - -- Build images using the variable definition file: +- Build images using the relevant variable definition file: cd packer - PACKER_LOG=1 /usr/bin/packer build -except openstack.openhpc --on-error=ask -var-file=$PKR_VAR_environment_root/builder.pkrvars.hcl openstack.pkr.hcl - - Note the builder VMs are added to the `builder` group to differentiate them from "real" nodes - see developer notes below. + PACKER_LOG=1 /usr/bin/packer build -only openstack.openhpc --on-error=ask -var-file=$PKR_VAR_environment_root/builder.pkrvars.hcl openstack.pkr.hcl -- The built image will be automatically uploaded to OpenStack with a name prefixed `ohpc-` and including a timestamp and a shortened git hash. + Note the build VM is added to the `builder` group to differentiate it from "real" nodes - see developer notes below. -[^1]: With the default Terraform at least. +- The built image will be automatically uploaded to OpenStack with a name prefixed `openhpc-` and including a timestamp and a shortened git hash. # Notes for developers -Packer build VMs are added to both the `builder` group and other groups (e.g. `compute`) as appropriate. The former group allows `environments/common/inventory/group_vars/builder/defaults.yml` to set variables specifically for the Packer builds, e.g. for services which should not be started. +Packer build VMs are added to both the `builder` group and the other top-level groups (e.g. `control`, `compute`, etc.). The former group allows `environments/common/inventory/group_vars/builder/defaults.yml` to set variables specifically for the Packer builds, e.g. for services which should not be started. Note that hostnames in the Packer VMs are not the same as the equivalent "real" hosts.
Therefore variables required inside a Packer VM must be defined as group vars, not hostvars. Ansible may need to proxy to compute nodes. If the Packer build should not use the same proxy to connect to the builder VMs, note that proxy configuration should not be added to the `all` group. - -When using appliance defaults and an environment with an `inventory/groups` file matching `environments/common/layouts/everything` (as used by cookiecutter for new environment creation), the following inventory variables must be defined when running Packer builds: -- `openhpc_cluster_name` -- `openondemand_servername` -- `inventory_hostname` for a host in the `control` group (provides `openhpc_slurm_control_host` and `nfs_server`) From 41fc84a8a8cac7e1ef5a45d62062db16b2e86472 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 14 Mar 2024 11:28:00 +0000 Subject: [PATCH 29/29] bump os-manila role after release --- requirements.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.yml b/requirements.yml index 3a6dbb7f6..88e5c62d7 100644 --- a/requirements.yml +++ b/requirements.yml @@ -22,7 +22,7 @@ roles: version: v3.0.6 - src: https://github.com/stackhpc/ansible-role-os-manila-mount.git name: stackhpc.os-manila-mount - version: feat/RL9 # TODO: bump to release + version: v24.2.0 # Support RockyLinux 9 collections: - name: containers.podman