diff --git a/.github/workflows/fatimage.yml b/.github/workflows/fatimage.yml index 6c33c6ee9..7dbd95d20 100644 --- a/.github/workflows/fatimage.yml +++ b/.github/workflows/fatimage.yml @@ -1,12 +1,26 @@ name: Build fat image -on: +'on': workflow_dispatch: + inputs: + use_RL9: + required: true + description: Include RL9 image build + type: boolean + default: false jobs: openstack: name: openstack-imagebuild - concurrency: ${{ github.ref }} # to branch/PR runs-on: ubuntu-20.04 + concurrency: ${{ github.ref }}-${{ matrix.os_version }} # to branch/PR + OS, so RL8/RL9 matrix jobs don't block each other + strategy: + matrix: + os_version: [RL8, RL9] + rl9_selected: + - ${{ inputs.use_RL9 == true }} # only potentially true for workflow_dispatch + exclude: + - os_version: RL9 + rl9_selected: false env: ANSIBLE_FORCE_COLOR: True OS_CLOUD: openstack @@ -48,6 +62,8 @@ jobs: cd packer/ packer init . PACKER_LOG=1 packer build -only openstack.openhpc -on-error=${{ vars.PACKER_ON_ERROR }} -var-file=$PKR_VAR_environment_root/${{ vars.CI_CLOUD }}.pkrvars.hcl openstack.pkr.hcl + env: + PKR_VAR_os_version: ${{ matrix.os_version }} - name: Get created image name from manifest id: manifest diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml index 3f944063c..bdb85029b 100644 --- a/.github/workflows/stackhpc.yml +++ b/.github/workflows/stackhpc.yml @@ -2,6 +2,12 @@ name: Test deployment and reimage on OpenStack on: workflow_dispatch: + inputs: + use_RL9: + required: true + description: Include RL9 tests + type: boolean + default: false push: branches: - main @@ -9,12 +15,23 @@ on: jobs: openstack: name: openstack-ci - concurrency: ${{ github.ref }} # to branch/PR + concurrency: ${{ github.ref }}-${{ matrix.os_version }} # to branch/PR + OS, so RL8/RL9 matrix jobs don't block each other runs-on: ubuntu-20.04 + strategy: + matrix: + os_version: [RL8, RL9] + rl9_selected: + - ${{ inputs.use_RL9 == true }} # only potentially true for workflow_dispatch + rl9_branch: + - ${{ startsWith(github.head_ref, 'rl9') == true }} # only potentially for pull_request, always 
false on merge + exclude: + - os_version: RL9 + rl9_selected: false + rl9_branch: false env: ANSIBLE_FORCE_COLOR: True OS_CLOUD: openstack - TF_VAR_cluster_name: ci${{ github.run_id }} + TF_VAR_cluster_name: slurmci-${{ matrix.os_version }}-${{ github.run_id }} CI_CLOUD: ${{ vars.CI_CLOUD }} steps: - uses: actions/checkout@v2 @@ -69,6 +86,8 @@ jobs: . environments/.stackhpc/activate cd $APPLIANCES_ENVIRONMENT_ROOT/terraform terraform apply -auto-approve -var-file="${{ vars.CI_CLOUD }}.tfvars" + env: + TF_VAR_os_version: ${{ matrix.os_version }} - name: Delete infrastructure if provisioning failed run: | @@ -77,6 +96,8 @@ jobs: cd $APPLIANCES_ENVIRONMENT_ROOT/terraform terraform destroy -auto-approve -var-file="${{ vars.CI_CLOUD }}.tfvars" if: failure() && steps.provision_servers.outcome == 'failure' + env: + TF_VAR_os_version: ${{ matrix.os_version }} - name: Configure cluster run: | @@ -175,6 +196,8 @@ jobs: cd $APPLIANCES_ENVIRONMENT_ROOT/terraform terraform destroy -auto-approve -var-file="${{ vars.CI_CLOUD }}.tfvars" if: ${{ success() || cancelled() }} + env: + TF_VAR_os_version: ${{ matrix.os_version }} # - name: Delete images # run: | diff --git a/ansible/bootstrap.yml b/ansible/bootstrap.yml index 9b6fda0de..ea56d1c9b 100644 --- a/ansible/bootstrap.yml +++ b/ansible/bootstrap.yml @@ -40,7 +40,13 @@ - hosts: cluster gather_facts: false tasks: - - name: Add groups + - name: Prevent ssh hanging if shared home is unavailable + lineinfile: + path: /etc/profile + search_string: HOSTNAME=$(/usr/bin/hostnamectl --transient 2>/dev/null) || \ + state: absent + become: yes + - name: Add system user groups ansible.builtin.group: "{{ item.group }}" loop: "{{ appliances_local_users }}" when: @@ -50,7 +56,7 @@ # Need to change working directory otherwise we try to switch back to non-existent directory. 
become_flags: '-i' become: true - - name: Add users + - name: Add system users ansible.builtin.user: "{{ item.user }}" loop: "{{ appliances_local_users }}" when: item.enable | default(true) | bool diff --git a/ansible/fatimage.yml b/ansible/fatimage.yml index 658d24d44..0764477b3 100644 --- a/ansible/fatimage.yml +++ b/ansible/fatimage.yml @@ -49,7 +49,7 @@ - hosts: builder become: yes - gather_facts: no + gather_facts: yes tasks: # - import_playbook: slurm.yml: - name: Setup DB @@ -61,15 +61,13 @@ name: stackhpc.openhpc tasks_from: install.yml - - name: Include distribution variables for osc.ood - include_vars: "{{ appliances_repository_root }}/ansible/roles/osc.ood/vars/Rocky/8.yml" - # FUTURE: install-apps.yml - this is git clones - # - import_playbook: portal.yml - name: Open Ondemand server - import_role: + include_role: name: osc.ood tasks_from: install-package.yml + vars_from: "Rocky/{{ ansible_distribution_major_version }}.yml" + # # FUTURE: install-apps.yml - this is git clones - name: Open Ondemand remote desktop import_role: name: openondemand diff --git a/ansible/roles/filebeat/templates/filebeat.service.j2 b/ansible/roles/filebeat/templates/filebeat.service.j2 index 7a3a14277..efdb26827 100644 --- a/ansible/roles/filebeat/templates/filebeat.service.j2 +++ b/ansible/roles/filebeat/templates/filebeat.service.j2 @@ -12,7 +12,7 @@ After=network-online.target [Service] Environment=PODMAN_SYSTEMD_UNIT=%n Restart=always -ExecStart=/usr/bin/podman run \ +ExecStart=/usr/bin/podman --cgroup-manager=cgroupfs run \ --network=host \ --sdnotify=conmon \ --cgroups=no-conmon \ diff --git a/ansible/roles/mysql/tasks/install.yml b/ansible/roles/mysql/tasks/install.yml index 4427b7d18..4ed5d30ba 100644 --- a/ansible/roles/mysql/tasks/install.yml +++ b/ansible/roles/mysql/tasks/install.yml @@ -1,6 +1,12 @@ +- name: Install pip + dnf: + name: python3-pip + - name: Install python mysql client pip: - name: pymysql + name: + - pymysql + - cryptography state: present - name: 
Create systemd mysql container unit file @@ -11,6 +17,6 @@ - name: Pull container image containers.podman.podman_image: - name: "mysql" + name: docker.io/library/mysql tag: "{{ mysql_tag }}" become_user: "{{ mysql_podman_user }}" diff --git a/ansible/roles/mysql/templates/mysql.service.j2 b/ansible/roles/mysql/templates/mysql.service.j2 index 3b531cd3f..657e111e9 100644 --- a/ansible/roles/mysql/templates/mysql.service.j2 +++ b/ansible/roles/mysql/templates/mysql.service.j2 @@ -14,7 +14,7 @@ EnvironmentFile=/etc/sysconfig/mysqld # The above EnvironmentFile must define MYSQL_INITIAL_ROOT_PASSWORD ExecStartPre=+install -d -o {{ mysql_podman_user }} -g {{ mysql_podman_user }} -Z container_file_t {{ mysql_datadir }} ExecStartPre=+chown -R {{ mysql_podman_user }}:{{ mysql_podman_user }} {{ mysql_datadir }} -ExecStart=/usr/bin/podman run \ +ExecStart=/usr/bin/podman --cgroup-manager=cgroupfs run \ --network=host \ --sdnotify=conmon \ --cgroups=no-conmon \ @@ -26,7 +26,7 @@ ExecStart=/usr/bin/podman run \ --volume {{ mysql_datadir }}:/var/lib/mysql:U \ --publish 3306:3306 \ --env MYSQL_ROOT_PASSWORD=${MYSQL_INITIAL_ROOT_PASSWORD} \ - mysql:{{ mysql_tag }}{%- for opt in mysql_mysqld_options %} \ + docker.io/library/mysql:{{ mysql_tag }}{%- for opt in mysql_mysqld_options %} \ --{{ opt }}{% endfor %} ExecStop=/usr/bin/podman stop --ignore mysql -t 10 diff --git a/ansible/roles/openondemand/tasks/main.yml b/ansible/roles/openondemand/tasks/main.yml index 34e1ac223..86184f13c 100644 --- a/ansible/roles/openondemand/tasks/main.yml +++ b/ansible/roles/openondemand/tasks/main.yml @@ -10,7 +10,7 @@ - include_role: name: osc.ood tasks_from: install-package.yml - vars_from: Rocky/8.yml + vars_from: "Rocky/{{ ansible_distribution_major_version }}.yml" public: yes # Expose the vars from this role to the rest of the play # can't set vars: from a dict hence the workaround above diff --git a/ansible/roles/openondemand/tasks/vnc_compute.yml 
b/ansible/roles/openondemand/tasks/vnc_compute.yml index bde13c383..5f403bf86 100644 --- a/ansible/roles/openondemand/tasks/vnc_compute.yml +++ b/ansible/roles/openondemand/tasks/vnc_compute.yml @@ -17,6 +17,7 @@ - turbovnc-3.0.1 - nmap-ncat - python3.9 + - dbus-x11 - name: Install Xfce desktop tags: install diff --git a/ansible/roles/opensearch/tasks/install.yml b/ansible/roles/opensearch/tasks/install.yml index 81547e5a0..9a0ffd361 100644 --- a/ansible/roles/opensearch/tasks/install.yml +++ b/ansible/roles/opensearch/tasks/install.yml @@ -16,7 +16,7 @@ - name: Pull container image containers.podman.podman_image: - name: "opensearchproject/opensearch" + name: docker.io/opensearchproject/opensearch tag: "{{ opensearch_version }}" become_user: "{{ opensearch_podman_user }}" diff --git a/ansible/roles/opensearch/templates/opensearch.service.j2 b/ansible/roles/opensearch/templates/opensearch.service.j2 index 6951bafc0..00dedfc7b 100644 --- a/ansible/roles/opensearch/templates/opensearch.service.j2 +++ b/ansible/roles/opensearch/templates/opensearch.service.j2 @@ -11,7 +11,7 @@ Environment=PODMAN_SYSTEMD_UNIT=%n Restart=always # paths below based on https://opensearch.org/docs/latest/opensearch/configuration/ and https://opensearch.org/docs/latest/security-plugin/configuration/yaml # see also https://opensearch.org/docs/2.0/opensearch/install/important-settings/ -ExecStart=/usr/bin/podman run \ +ExecStart=/usr/bin/podman --cgroup-manager=cgroupfs run \ --network=host \ --sdnotify=conmon \ --cgroups=no-conmon \ @@ -29,7 +29,7 @@ ExecStart=/usr/bin/podman run \ --env bootstrap.memory_lock=true \ --env "OPENSEARCH_JAVA_OPTS=-Xms512m -Xmx512m" \ --env DISABLE_INSTALL_DEMO_CONFIG=true \ - opensearchproject/opensearch:{{ opensearch_version }} + docker.io/opensearchproject/opensearch:{{ opensearch_version }} ExecStop=/usr/bin/podman stop --ignore opensearch -t 10 # note for some reason this returns status=143 which makes systemd show the unit as failed, not stopped 
ExecStopPost=/usr/bin/podman rm --ignore -f opensearch diff --git a/environments/.stackhpc/ARCUS.pkrvars.hcl b/environments/.stackhpc/ARCUS.pkrvars.hcl index 2b1bbfb39..738a021c0 100644 --- a/environments/.stackhpc/ARCUS.pkrvars.hcl +++ b/environments/.stackhpc/ARCUS.pkrvars.hcl @@ -1,10 +1,8 @@ flavor = "vm.ska.cpu.general.small" use_blockstorage_volume = true -volume_size = 12 # GB. Compatible with SMS-lab's general.v1.tiny +volume_size = 12 # GB. Compatible with SMS-lab's general.v1.tiny image_disk_format = "qcow2" networks = ["4b6b2722-ee5b-40ec-8e52-a6610e14cc51"] # portal-internal (DNS broken on ilab-60) -source_image_name = "openhpc-230804-1754-80b8d714" # https://github.com/stackhpc/ansible-slurm-appliance/pull/298 -fatimage_source_image_name = "Rocky-8-GenericCloud-Base-8.9-20231119.0.x86_64.qcow2" ssh_keypair_name = "slurm-app-ci" ssh_private_key_file = "~/.ssh/id_rsa" security_groups = ["default", "SSH"] diff --git a/environments/.stackhpc/LEAFCLOUD.pkrvars.hcl b/environments/.stackhpc/LEAFCLOUD.pkrvars.hcl index bb778b0ae..1e2b6d3e2 100644 --- a/environments/.stackhpc/LEAFCLOUD.pkrvars.hcl +++ b/environments/.stackhpc/LEAFCLOUD.pkrvars.hcl @@ -4,8 +4,6 @@ volume_size = 12 # GB. 
Compatible with SMS-lab's general.v1.tiny volume_type = "unencrypted" image_disk_format = "qcow2" networks = ["909e49e8-6911-473a-bf88-0495ca63853c"] # slurmapp-ci -source_image_name = "openhpc-230804-1754-80b8d714" # https://github.com/stackhpc/ansible-slurm-appliance/pull/298 -fatimage_source_image_name = "Rocky-8-GenericCloud-Base-8.9-20231119.0.x86_64.qcow2" ssh_keypair_name = "slurm-app-ci" ssh_private_key_file = "~/.ssh/id_rsa" security_groups = ["default", "SSH"] diff --git a/environments/.stackhpc/hooks/post-bootstrap.yml b/environments/.stackhpc/hooks/post-bootstrap.yml index fe783e469..df3902698 100644 --- a/environments/.stackhpc/hooks/post-bootstrap.yml +++ b/environments/.stackhpc/hooks/post-bootstrap.yml @@ -3,14 +3,15 @@ gather_facts: false tags: podman tasks: - - name: Configure container image registry for unqualified searches to avoid docker.io ratelimits + - name: Configure container image registry to avoid docker.io ratelimits copy: - dest: /etc/containers/registries.conf.d/003-arcus-unqualfied-overrides.conf + dest: /etc/containers/registries.conf.d/003-arcus-mirror.conf content: | - unqualified-search-registries = ['{{ podman_registry_address | split('/') | first }}', 'registry.access.redhat.com', 'registry.redhat.io', 'docker.io'] - [[registry]] - prefix = "{{ podman_registry_address }}" + location="docker.io/library/" + prefix="docker.io/library/" + + [[registry.mirror]] location = "{{ podman_registry_address }}" insecure = true when: "ci_cloud == 'ARCUS'" diff --git a/environments/.stackhpc/hooks/pre.yml b/environments/.stackhpc/hooks/pre.yml index 65dfad72d..8e6f0e25e 100644 --- a/environments/.stackhpc/hooks/pre.yml +++ b/environments/.stackhpc/hooks/pre.yml @@ -2,6 +2,8 @@ become: yes gather_facts: false tasks: + - name: Output OS version + command: cat /etc/redhat-release - name: Write CI-generated inventory and secrets for debugging ansible.builtin.copy: dest: /etc/ci-config/ diff --git 
a/environments/.stackhpc/inventory/group_vars/openondemand/overrides.yml b/environments/.stackhpc/inventory/group_vars/openondemand/overrides.yml index f810b7ecc..735da25df 100644 --- a/environments/.stackhpc/inventory/group_vars/openondemand/overrides.yml +++ b/environments/.stackhpc/inventory/group_vars/openondemand/overrides.yml @@ -1,6 +1,6 @@ openondemand_auth: basic_pam -openondemand_jupyter_partition: small -openondemand_desktop_partition: small +openondemand_jupyter_partition: standard +openondemand_desktop_partition: standard #openondemand_dashboard_support_url: #openondemand_dashboard_docs_url: #openondemand_filesapp_paths: diff --git a/environments/.stackhpc/terraform/main.tf b/environments/.stackhpc/terraform/main.tf index 9eaf3dccd..46709463f 100644 --- a/environments/.stackhpc/terraform/main.tf +++ b/environments/.stackhpc/terraform/main.tf @@ -19,11 +19,19 @@ variable "cluster_name" { description = "Name for cluster, used as prefix for resources - set by environment var in CI" } +variable "os_version" { + type = string + description = "RL8 or RL9" +} + variable "cluster_image" { - description = "single image for all cluster nodes - a convenience for CI" - type = string - default = "openhpc-240308-1011-0f0291c0" # https://github.com/stackhpc/ansible-slurm-appliance/pull/364 - # default = "Rocky-8-GenericCloud-Base-8.9-20231119.0.x86_64.qcow2" + description = "single image for all cluster nodes, keyed by os_version - a convenience for CI" + type = map(string) + default = { + # https://github.com/stackhpc/ansible-slurm-appliance/pull/353 + RL8: "openhpc-RL8-240313-1028-15f9ab38" + RL9: "openhpc-RL9-240313-1057-15f9ab38" + } } variable "cluster_net" {} @@ -60,23 +68,23 @@ module "cluster" { key_pair = "slurm-app-ci" control_node = { flavor: var.control_node_flavor - image: var.cluster_image + image: var.cluster_image[var.os_version] } login_nodes = { login-0: { flavor: var.other_node_flavor - image: var.cluster_image + image: 
var.cluster_image[var.os_version] } } compute_types = { standard: { # NB: can't call this default! flavor: var.other_node_flavor - image: var.cluster_image + image: var.cluster_image[var.os_version] } # Example of how to add another partition: # extra: { # flavor: var.other_node_flavor - # image: var.cluster_image + # image: var.cluster_image[var.os_version] # } } compute_nodes = { diff --git a/environments/common/inventory/group_vars/all/defaults.yml b/environments/common/inventory/group_vars/all/defaults.yml index 23448c80d..91db4dc3a 100644 --- a/environments/common/inventory/group_vars/all/defaults.yml +++ b/environments/common/inventory/group_vars/all/defaults.yml @@ -58,6 +58,7 @@ appliances_local_users_default: uid: 981 home: "{{ prometheus_db_dir }}" shell: /usr/sbin/nologin + system: true enable: "{{ 'prometheus' in group_names }}" - group: @@ -69,6 +70,7 @@ appliances_local_users_default: uid: 984 home: /usr/share/grafana shell: /sbin/nologin + system: true enable: "{{ 'grafana' in group_names }}" # Overide this to add extra users whilst keeping the defaults. 
diff --git a/environments/common/inventory/group_vars/all/openondemand.yml b/environments/common/inventory/group_vars/all/openondemand.yml index b7bdfdabc..18e741ce7 100644 --- a/environments/common/inventory/group_vars/all/openondemand.yml +++ b/environments/common/inventory/group_vars/all/openondemand.yml @@ -49,6 +49,8 @@ openondemand_clusters: module purge export PATH=/opt/TurboVNC/bin:$PATH + # avoid "Failed to create secure directory (/run/user/*/pulse)" + export XDG_RUNTIME_DIR="$TMPDIR/xdg_runtime" # Workaround to avoid "Unable to contact settings server" when # lauching xfce4-session diff --git a/packer/README.md b/packer/README.md index 1cd14c292..c2a754e5d 100644 --- a/packer/README.md +++ b/packer/README.md @@ -1,27 +1,24 @@ # Packer-based image build -The appliance contains code and configuration to use Packer with the [OpenStack builder](https://www.packer.io/plugins/builders/openstack) to build images. Two types of images can be built: +The appliance contains code and configuration to use Packer with the [OpenStack builder](https://www.packer.io/plugins/builders/openstack) to build images. -1. A "fat" image, containing binaries for all nodes, but no configuration. By default, this is done in StackHPC's CI workflow and the image made available to clients. The fat image is intended to be used as the base image for a cluster. This: - - Ensures the cluster is using binaries which have been tested in CI. - - Ensures deployment and further image builds can be completed even if packages are changed in upstream repositories (e.g. due to Rocky Linux or OpenHPC updates). - - Reduces the number of package downloads to improve deployment speed. +The image built is referred to as a "fat" image as it contains binaries for all nodes, but no configuration. Using a "fat" image: +- Enables the image to be tested in CI before production use. 
+- Ensures re-deployment of the cluster or deployment of additional nodes can be completed even if packages are changed in upstream repositories (e.g. due to RockyLinux or OpenHPC updates). +- Improves deployment speed by reducing the number of package downloads. - This build starts from a RockyLinux GenericCloud image and runs yum update. +A default fat image is built in StackHPC's CI workflow and made available to clients. However it is possible to build site-specific fat images if required. -2. An environment-specific compute node image, which additionally contains all configuration etc. to allow an instance booted with such an image to join a cluster. This allows Slurm to be used to reimage compute nodes for upgrades, see [stackhpc.slurm_openstack_tools.rebuild/README.md](../ansible/collections/ansible_collections/stackhpc/slurm_openstack_tools/roles/rebuild/README.md). This build starts from a "fat" image and does not run yum update. +A fat image build starts from a RockyLinux GenericCloud image and (by default) updates all dnf packages in that image. # Build Process - -Building an environment-specific compute node image will[^1] require a cluster to be provisioned to complete the Ansible host/group variables in inventory for the environment. - - Ensure the current OpenStack credentials have sufficient authorisation to upload images (this may or may not require the `member` role for an application credential, depending on your OpenStack configuration). - Create a file `environments//builder.pkrvars.hcl` containing at a minimum e.g.: ```hcl flavor = "general.v1.small" # VM flavor to use for builder VMs networks = ["26023e3d-bc8e-459c-8def-dbd47ab01756"] # List of network UUIDs to attach the VM to - source_image_name = "Rocky-8.5-GenericCloud" # Name of source image. This must exist in OpenStack and should be a Rocky Linux 8.5 GenericCloud-based image. + source_image_name = "Rocky-8.9-GenericCloud" # Name of source image. 
This must exist in OpenStack and should be a Rocky Linux GenericCloud-based image. ``` This configuration will generate and use an ephemeral SSH key for communicating with the Packer VM. If this is undesirable, set `ssh_keypair_name` to the name of an existing keypair in OpenStack. The private key must be on the host running Packer, and its path can be set using `ssh_private_key_file`. @@ -31,32 +28,20 @@ Building an environment-specific compute node image will[^1] require a cluster t For additional options such as non-default private key locations or jumphost configuration see the variable descriptions in `./openstack.pkr.hcl`. - Activate the venv and the relevant environment. -- Ensure you have generated passwords using: - - ansible-playbook ansible/adhoc/generate-passwords.yml -- Ensure you have the private part of the keypair `ssh_keypair_name` at `~/.ssh/id_rsa.pub` (or set variable `ssh_private_key_file` in `builder.pkrvars.hcl`). - -- Build images using the variable definition file: +- Build images using the relevant variable definition file: cd packer - PACKER_LOG=1 /usr/bin/packer build -except openstack.openhpc --on-error=ask -var-file=$PKR_VAR_environment_root/builder.pkrvars.hcl openstack.pkr.hcl - - Note the builder VMs are added to the `builder` group to differentiate them from "real" nodes - see developer notes below. + PACKER_LOG=1 /usr/bin/packer build -only openstack.openhpc --on-error=ask -var-file=$PKR_VAR_environment_root/builder.pkrvars.hcl openstack.pkr.hcl -- The built image will be automatically uploaded to OpenStack with a name prefixed `ohpc-` and including a timestamp and a shortened git hash. + Note the build VM is added to the `builder` group to differentiate it from "real" nodes - see developer notes below. -[^1]: With the default Terraform at least. +- The built image will be automatically uploaded to OpenStack with a name prefixed `openhpc-` and including the OS version, a timestamp and a shortened git hash. 
# Notes for developers -Packer build VMs are added to both the `builder` group and other groups (e.g. `compute`) as appropriate. The former group allows `environments/common/inventory/group_vars/builder/defaults.yml` to set variables specifically for the Packer builds, e.g. for services which should not be started. +Packer build VMs are added to both the `builder` group and the other top-level groups (e.g. `control`, `compute`, etc.). The former group allows `environments/common/inventory/group_vars/builder/defaults.yml` to set variables specifically for the Packer builds, e.g. for services which should not be started. Note that hostnames in the Packer VMs are not the same as the equivalent "real" hosts. Therefore variables required inside a Packer VM must be defined as group vars, not hostvars. Ansible may need to proxy to compute nodes. If the Packer build should not use the same proxy to connect to the builder VMs, note that proxy configuration should not be added to the `all` group. 
- -When using appliance defaults and an environment with an `inventory/groups` file matching `environments/common/layouts/everything` (as used by cookiecutter for new environment creation), the following inventory variables must be defined when running Packer builds: -- `openhpc_cluster_name` -- `openondemand_servername` -- `inventory_hostname` for a host in the `control` group (provides `openhpc_slurm_control_host` and `nfs_server`) diff --git a/packer/compute_extravars.yml b/packer/compute_extravars.yml deleted file mode 100644 index 2b1fe6963..000000000 --- a/packer/compute_extravars.yml +++ /dev/null @@ -1,3 +0,0 @@ -# Used to override anything defined in a concrete environment -update_enable: false -openhpc_slurm_partitions: [] # as no compute nodes will be in play, but partition definition might exist in inventory diff --git a/packer/openstack.pkr.hcl b/packer/openstack.pkr.hcl index d3a0283d7..7fdc8037e 100644 --- a/packer/openstack.pkr.hcl +++ b/packer/openstack.pkr.hcl @@ -39,26 +39,26 @@ variable "networks" { type = list(string) } -# Must supply either source_image_name or source_image -variable "source_image_name" { +variable "os_version" { type = string - default = null -} - -variable "source_image" { - type = string - default = null + description = "RL8 or RL9" } # Must supply either fatimage_source_image_name or fatimage_source_image variable "fatimage_source_image_name" { - type = string - default = null + type = map(string) + default = { + RL8: "Rocky-8-GenericCloud-Base-8.9-20231119.0.x86_64.qcow2" + RL9: "Rocky-9-GenericCloud-Base-9.3-20231113.0.x86_64.qcow2" + } } variable "fatimage_source_image" { - type = string - default = null + type = map(string) + default = { + RL8: null + RL9: null + } } variable "flavor" { @@ -159,38 +159,13 @@ source "openstack" "openhpc" { image_visibility = "${var.image_visibility}" } -# NB: build names, split on "-", are used to determine groups to add build to, so could build for a compute gpu group using e.g. 
`compute-gpu`. -build { - source "source.openstack.openhpc" { - name = "compute" - source_image = "${var.source_image}" - source_image_name = "${var.source_image_name}" # NB: must already exist in OpenStack - image_name = "ohpc-${source.name}-${local.timestamp}-${substr(local.git_commit, 0, 8)}" # also provides a unique legal instance hostname (in case of parallel packer builds) - } - - provisioner "ansible" { - playbook_file = "${var.repo_root}/ansible/site.yml" - groups = concat(["builder"], split("-", "${source.name}")) - keep_inventory_file = true # for debugging - use_proxy = false # see https://www.packer.io/docs/provisioners/ansible#troubleshooting - extra_arguments = ["--limit", "builder", "-i", "${var.repo_root}/packer/ansible-inventory.sh", "-vv", "-e", "@${var.repo_root}/packer/${source.name}_extravars.yml"] - } - - post-processor "manifest" { - output = "${var.manifest_output_path}" - custom_data = { - source = "${source.name}" - } - } -} - # The "fat" image build with all binaries: build { source "source.openstack.openhpc" { floating_ip_network = "${var.floating_ip_network}" - source_image = "${var.fatimage_source_image}" - source_image_name = "${var.fatimage_source_image_name}" # NB: must already exist in OpenStack - image_name = "${source.name}-${local.timestamp}-${substr(local.git_commit, 0, 8)}" # similar to name from slurm_image_builder + source_image = "${var.fatimage_source_image[var.os_version]}" + source_image_name = "${var.fatimage_source_image_name[var.os_version]}" # NB: must already exist in OpenStack + image_name = "${source.name}-${var.os_version}-${local.timestamp}-${substr(local.git_commit, 0, 8)}" # similar to name from slurm_image_builder } provisioner "ansible" { diff --git a/requirements.yml b/requirements.yml index 124759d20..88e5c62d7 100644 --- a/requirements.yml +++ b/requirements.yml @@ -3,7 +3,7 @@ roles: - src: stackhpc.nfs version: v23.12.1 # Tolerate state nfs file handles - src: 
https://github.com/stackhpc/ansible-role-openhpc.git - version: v0.23.0 # https://github.com/stackhpc/ansible-role-openhpc/pull/165 + version: v0.24.0 # https://github.com/stackhpc/ansible-role-openhpc/pull/164 name: stackhpc.openhpc - src: https://github.com/stackhpc/ansible-node-exporter.git version: stackhpc @@ -22,7 +22,7 @@ roles: version: v3.0.6 - src: https://github.com/stackhpc/ansible-role-os-manila-mount.git name: stackhpc.os-manila-mount - version: v24.1.0 + version: v24.2.0 # Support RockyLinux 9 collections: - name: containers.podman