Skip to content

Commit

Permalink
Add support for RockyLinux9 (#353)
Browse files Browse the repository at this point in the history
* add changes from branch rl9

* fix unqualified names for container pulls

* fix openondemand install

* bugfix filebeat unit reload

* comment on required image

* bump fat image base to RL9.3

* fix dbus-launch command for OOD desktop

* fix OOD desktop launch

* fix useradd warning: {grafana,prometheus}'s uid * outside of the UID_MIN 1000 and UID_MAX 60000 range.

* fix manila support in RL9

* bump openhpc role after merge

* downsize arcus control node to try to avoid CI failures

* prevent ssh hanging after NFS server reimaged in CI

* fix 'prevent ssh hanging after NFS server reimaged in CI'

* fix import of osc GPG key for RL9

* make fatimage more like openondemand role

* enable fatimage build on either RL8 or RL9, inc on PRs

* get concurrent OS builds

* enable RL8 and RL9 for CI test workflow

* fix stackhpc workflow

* fix CI image selection

* fix os debug output

* fix podman systemd warnings

* bump CI image

* bump openhpc role for release

* fix OOD app partitions

* delete unused packer vars for compute image builds

* update packer README to remove compute image builds

* bump os-manila role after release
  • Loading branch information
sjpb authored Mar 14, 2024
1 parent 678d08d commit a415036
Show file tree
Hide file tree
Showing 23 changed files with 132 additions and 114 deletions.
20 changes: 18 additions & 2 deletions .github/workflows/fatimage.yml
Original file line number Diff line number Diff line change
@@ -1,12 +1,26 @@

name: Build fat image
on:
'on':
workflow_dispatch:
inputs:
use_RL9:
required: true
description: Include RL9 image build
type: boolean
default: false
jobs:
openstack:
name: openstack-imagebuild
concurrency: ${{ github.ref }} # to branch/PR
runs-on: ubuntu-20.04
# One concurrency group per branch/PR *and* per matrix OS, so the RL8 and RL9
# jobs of the same run do not queue behind (or cancel) each other.
concurrency: ${{ github.ref }}-${{ matrix.os_version }} # to branch/PR + OS
strategy:
matrix:
os_version: [RL8, RL9]
rl9_selected:
- ${{ inputs.use_RL9 == true }} # only potentially true for workflow_dispatch
exclude:
- os_version: RL9
rl9_selected: false
env:
ANSIBLE_FORCE_COLOR: True
OS_CLOUD: openstack
Expand Down Expand Up @@ -48,6 +62,8 @@ jobs:
cd packer/
packer init .
PACKER_LOG=1 packer build -only openstack.openhpc -on-error=${{ vars.PACKER_ON_ERROR }} -var-file=$PKR_VAR_environment_root/${{ vars.CI_CLOUD }}.pkrvars.hcl openstack.pkr.hcl
env:
PKR_VAR_os_version: ${{ matrix.os_version }}

- name: Get created image name from manifest
id: manifest
Expand Down
27 changes: 25 additions & 2 deletions .github/workflows/stackhpc.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,36 @@
name: Test deployment and reimage on OpenStack
on:
workflow_dispatch:
inputs:
use_RL9:
required: true
description: Include RL9 tests
type: boolean
default: false
push:
branches:
- main
pull_request:
jobs:
openstack:
name: openstack-ci
concurrency: ${{ github.ref }} # to branch/PR
# One concurrency group per branch/PR *and* per matrix OS, so the RL8 and RL9
# jobs of the same run do not queue behind (or cancel) each other.
concurrency: ${{ github.ref }}-${{ matrix.os_version }} # to branch/PR + OS
runs-on: ubuntu-20.04
strategy:
matrix:
os_version: [RL8, RL9]
rl9_selected:
- ${{ inputs.use_RL9 == true }} # only potentially true for workflow_dispatch
rl9_branch:
- ${{ startsWith(github.head_ref, 'rl9') == true }} # only potentially true for pull_request, always false on merge
exclude:
- os_version: RL9
rl9_selected: false
rl9_branch: false
env:
ANSIBLE_FORCE_COLOR: True
OS_CLOUD: openstack
TF_VAR_cluster_name: ci${{ github.run_id }}
TF_VAR_cluster_name: slurmci-${{ matrix.os_version }}-${{ github.run_id }}
CI_CLOUD: ${{ vars.CI_CLOUD }}
steps:
- uses: actions/checkout@v2
Expand Down Expand Up @@ -69,6 +86,8 @@ jobs:
. environments/.stackhpc/activate
cd $APPLIANCES_ENVIRONMENT_ROOT/terraform
terraform apply -auto-approve -var-file="${{ vars.CI_CLOUD }}.tfvars"
env:
TF_VAR_os_version: ${{ matrix.os_version }}

- name: Delete infrastructure if provisioning failed
run: |
Expand All @@ -77,6 +96,8 @@ jobs:
cd $APPLIANCES_ENVIRONMENT_ROOT/terraform
terraform destroy -auto-approve -var-file="${{ vars.CI_CLOUD }}.tfvars"
if: failure() && steps.provision_servers.outcome == 'failure'
env:
TF_VAR_os_version: ${{ matrix.os_version }}

- name: Configure cluster
run: |
Expand Down Expand Up @@ -175,6 +196,8 @@ jobs:
cd $APPLIANCES_ENVIRONMENT_ROOT/terraform
terraform destroy -auto-approve -var-file="${{ vars.CI_CLOUD }}.tfvars"
if: ${{ success() || cancelled() }}
env:
TF_VAR_os_version: ${{ matrix.os_version }}

# - name: Delete images
# run: |
Expand Down
10 changes: 8 additions & 2 deletions ansible/bootstrap.yml
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,13 @@
- hosts: cluster
gather_facts: false
tasks:
- name: Add groups
# Removes the hostnamectl-based HOSTNAME lookup line from /etc/profile.
# NOTE(review): presumably this lookup is what blocks login shells when the
# shared home is unavailable - confirm against the target image's /etc/profile.
- name: Prevent ssh hanging if shared home is unavailable
  ansible.builtin.lineinfile:
    path: /etc/profile
    # Literal (non-regex) match; quoted so the trailing backslash is unambiguous
    search_string: 'HOSTNAME=$(/usr/bin/hostnamectl --transient 2>/dev/null) || \'
    state: absent
  become: true
- name: Add system user groups
ansible.builtin.group: "{{ item.group }}"
loop: "{{ appliances_local_users }}"
when:
Expand All @@ -50,7 +56,7 @@
# Need to change working directory otherwise we try to switch back to non-existent directory.
become_flags: '-i'
become: true
- name: Add users
- name: Add system users
ansible.builtin.user: "{{ item.user }}"
loop: "{{ appliances_local_users }}"
when: item.enable | default(true) | bool
Expand Down
10 changes: 4 additions & 6 deletions ansible/fatimage.yml
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@

- hosts: builder
become: yes
gather_facts: no
gather_facts: yes
tasks:
# - import_playbook: slurm.yml:
- name: Setup DB
Expand All @@ -61,15 +61,13 @@
name: stackhpc.openhpc
tasks_from: install.yml

- name: Include distribution variables for osc.ood
include_vars: "{{ appliances_repository_root }}/ansible/roles/osc.ood/vars/Rocky/8.yml"
# FUTURE: install-apps.yml - this is git clones

# - import_playbook: portal.yml
- name: Open Ondemand server
import_role:
include_role:
name: osc.ood
tasks_from: install-package.yml
vars_from: "Rocky/{{ ansible_distribution_major_version }}.yml"
# # FUTURE: install-apps.yml - this is git clones
- name: Open Ondemand remote desktop
import_role:
name: openondemand
Expand Down
2 changes: 1 addition & 1 deletion ansible/roles/filebeat/templates/filebeat.service.j2
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ After=network-online.target
[Service]
Environment=PODMAN_SYSTEMD_UNIT=%n
Restart=always
ExecStart=/usr/bin/podman run \
ExecStart=/usr/bin/podman --cgroup-manager=cgroupfs run \
--network=host \
--sdnotify=conmon \
--cgroups=no-conmon \
Expand Down
10 changes: 8 additions & 2 deletions ansible/roles/mysql/tasks/install.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
- name: Install pip
dnf:
name: python3-pip

- name: Install python mysql client
pip:
name: pymysql
name:
- pymysql
- cryptography
state: present

- name: Create systemd mysql container unit file
Expand All @@ -11,6 +17,6 @@

- name: Pull container image
containers.podman.podman_image:
name: "mysql"
name: docker.io/library/mysql
tag: "{{ mysql_tag }}"
become_user: "{{ mysql_podman_user }}"
4 changes: 2 additions & 2 deletions ansible/roles/mysql/templates/mysql.service.j2
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ EnvironmentFile=/etc/sysconfig/mysqld
# The above EnvironmentFile must define MYSQL_INITIAL_ROOT_PASSWORD
ExecStartPre=+install -d -o {{ mysql_podman_user }} -g {{ mysql_podman_user }} -Z container_file_t {{ mysql_datadir }}
ExecStartPre=+chown -R {{ mysql_podman_user }}:{{ mysql_podman_user }} {{ mysql_datadir }}
ExecStart=/usr/bin/podman run \
ExecStart=/usr/bin/podman --cgroup-manager=cgroupfs run \
--network=host \
--sdnotify=conmon \
--cgroups=no-conmon \
Expand All @@ -26,7 +26,7 @@ ExecStart=/usr/bin/podman run \
--volume {{ mysql_datadir }}:/var/lib/mysql:U \
--publish 3306:3306 \
--env MYSQL_ROOT_PASSWORD=${MYSQL_INITIAL_ROOT_PASSWORD} \
mysql:{{ mysql_tag }}{%- for opt in mysql_mysqld_options %} \
docker.io/library/mysql:{{ mysql_tag }}{%- for opt in mysql_mysqld_options %} \
--{{ opt }}{% endfor %}

ExecStop=/usr/bin/podman stop --ignore mysql -t 10
Expand Down
2 changes: 1 addition & 1 deletion ansible/roles/openondemand/tasks/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
- include_role:
name: osc.ood
tasks_from: install-package.yml
vars_from: Rocky/8.yml
vars_from: "Rocky/{{ ansible_distribution_major_version }}.yml"
public: yes # Expose the vars from this role to the rest of the play
# can't set vars: from a dict hence the workaround above

Expand Down
1 change: 1 addition & 0 deletions ansible/roles/openondemand/tasks/vnc_compute.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
- turbovnc-3.0.1
- nmap-ncat
- python3.9
- dbus-x11

- name: Install Xfce desktop
tags: install
Expand Down
2 changes: 1 addition & 1 deletion ansible/roles/opensearch/tasks/install.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

- name: Pull container image
containers.podman.podman_image:
name: "opensearchproject/opensearch"
name: docker.io/opensearchproject/opensearch
tag: "{{ opensearch_version }}"
become_user: "{{ opensearch_podman_user }}"

Expand Down
4 changes: 2 additions & 2 deletions ansible/roles/opensearch/templates/opensearch.service.j2
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ Environment=PODMAN_SYSTEMD_UNIT=%n
Restart=always
# paths below based on https://opensearch.org/docs/latest/opensearch/configuration/ and https://opensearch.org/docs/latest/security-plugin/configuration/yaml
# see also https://opensearch.org/docs/2.0/opensearch/install/important-settings/
ExecStart=/usr/bin/podman run \
ExecStart=/usr/bin/podman --cgroup-manager=cgroupfs run \
--network=host \
--sdnotify=conmon \
--cgroups=no-conmon \
Expand All @@ -29,7 +29,7 @@ ExecStart=/usr/bin/podman run \
--env bootstrap.memory_lock=true \
--env "OPENSEARCH_JAVA_OPTS=-Xms512m -Xmx512m" \
--env DISABLE_INSTALL_DEMO_CONFIG=true \
opensearchproject/opensearch:{{ opensearch_version }}
docker.io/opensearchproject/opensearch:{{ opensearch_version }}
ExecStop=/usr/bin/podman stop --ignore opensearch -t 10
# note for some reason this returns status=143 which makes systemd show the unit as failed, not stopped
ExecStopPost=/usr/bin/podman rm --ignore -f opensearch
Expand Down
4 changes: 1 addition & 3 deletions environments/.stackhpc/ARCUS.pkrvars.hcl
Original file line number Diff line number Diff line change
@@ -1,10 +1,8 @@
flavor = "vm.ska.cpu.general.small"
use_blockstorage_volume = true
volume_size = 12 # GB. Compatible with SMS-lab's general.v1.tiny
volume_size = 12 # GB. Compatible with SMS-lab's general.v1.tiny
image_disk_format = "qcow2"
networks = ["4b6b2722-ee5b-40ec-8e52-a6610e14cc51"] # portal-internal (DNS broken on ilab-60)
source_image_name = "openhpc-230804-1754-80b8d714" # https://github.com/stackhpc/ansible-slurm-appliance/pull/298
fatimage_source_image_name = "Rocky-8-GenericCloud-Base-8.9-20231119.0.x86_64.qcow2"
ssh_keypair_name = "slurm-app-ci"
ssh_private_key_file = "~/.ssh/id_rsa"
security_groups = ["default", "SSH"]
Expand Down
2 changes: 0 additions & 2 deletions environments/.stackhpc/LEAFCLOUD.pkrvars.hcl
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,6 @@ volume_size = 12 # GB. Compatible with SMS-lab's general.v1.tiny
volume_type = "unencrypted"
image_disk_format = "qcow2"
networks = ["909e49e8-6911-473a-bf88-0495ca63853c"] # slurmapp-ci
source_image_name = "openhpc-230804-1754-80b8d714" # https://github.com/stackhpc/ansible-slurm-appliance/pull/298
fatimage_source_image_name = "Rocky-8-GenericCloud-Base-8.9-20231119.0.x86_64.qcow2"
ssh_keypair_name = "slurm-app-ci"
ssh_private_key_file = "~/.ssh/id_rsa"
security_groups = ["default", "SSH"]
Expand Down
11 changes: 6 additions & 5 deletions environments/.stackhpc/hooks/post-bootstrap.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,15 @@
gather_facts: false
tags: podman
tasks:
- name: Configure container image registry for unqualified searches to avoid docker.io ratelimits
- name: Configure container image registry to avoid docker.io ratelimits
copy:
dest: /etc/containers/registries.conf.d/003-arcus-unqualfied-overrides.conf
dest: /etc/containers/registries.conf.d/003-arcus-mirror.conf
content: |
unqualified-search-registries = ['{{ podman_registry_address | split('/') | first }}', 'registry.access.redhat.com', 'registry.redhat.io', 'docker.io']
[[registry]]
prefix = "{{ podman_registry_address }}"
location="docker.io/library/"
prefix="docker.io/library/"
[[registry.mirror]]
location = "{{ podman_registry_address }}"
insecure = true
when: "ci_cloud == 'ARCUS'"
2 changes: 2 additions & 0 deletions environments/.stackhpc/hooks/pre.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
become: yes
gather_facts: false
tasks:
# Debug aid: prints the release string of the host being configured.
- name: Output OS version
  ansible.builtin.command: cat /etc/redhat-release
  # Read-only command, so never report this task as "changed"
  changed_when: false
- name: Write CI-generated inventory and secrets for debugging
ansible.builtin.copy:
dest: /etc/ci-config/
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
openondemand_auth: basic_pam
openondemand_jupyter_partition: small
openondemand_desktop_partition: small
openondemand_jupyter_partition: standard
openondemand_desktop_partition: standard
#openondemand_dashboard_support_url:
#openondemand_dashboard_docs_url:
#openondemand_filesapp_paths:
24 changes: 16 additions & 8 deletions environments/.stackhpc/terraform/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,19 @@ variable "cluster_name" {
description = "Name for cluster, used as prefix for resources - set by environment var in CI"
}

variable "os_version" {
type = string
description = "RL8 or RL9"
}

variable "cluster_image" {
description = "single image for all cluster nodes - a convenience for CI"
type = string
default = "openhpc-240308-1011-0f0291c0" # https://github.com/stackhpc/ansible-slurm-appliance/pull/364
# default = "Rocky-8-GenericCloud-Base-8.9-20231119.0.x86_64.qcow2"
description = "single image for all cluster nodes, keyed by os_version - a convenience for CI"
type = map(string)
default = {
# https://github.com/stackhpc/ansible-slurm-appliance/pull/353
RL8: "openhpc-RL8-240313-1028-15f9ab38"
RL9: "openhpc-RL9-240313-1057-15f9ab38"
}
}

variable "cluster_net" {}
Expand Down Expand Up @@ -60,23 +68,23 @@ module "cluster" {
key_pair = "slurm-app-ci"
control_node = {
flavor: var.control_node_flavor
image: var.cluster_image
image: var.cluster_image[var.os_version]
}
login_nodes = {
login-0: {
flavor: var.other_node_flavor
image: var.cluster_image
image: var.cluster_image[var.os_version]
}
}
compute_types = {
standard: { # NB: can't call this default!
flavor: var.other_node_flavor
image: var.cluster_image
image: var.cluster_image[var.os_version]
}
# Example of how to add another partition:
# extra: {
# flavor: var.other_node_flavor
# image: var.cluster_image
# image: var.cluster_image[var.os_version]
# }
}
compute_nodes = {
Expand Down
2 changes: 2 additions & 0 deletions environments/common/inventory/group_vars/all/defaults.yml
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ appliances_local_users_default:
uid: 981
home: "{{ prometheus_db_dir }}"
shell: /usr/sbin/nologin
system: true
enable: "{{ 'prometheus' in group_names }}"

- group:
Expand All @@ -69,6 +70,7 @@ appliances_local_users_default:
uid: 984
home: /usr/share/grafana
shell: /sbin/nologin
system: true
enable: "{{ 'grafana' in group_names }}"

# Override this to add extra users whilst keeping the defaults.
Expand Down
2 changes: 2 additions & 0 deletions environments/common/inventory/group_vars/all/openondemand.yml
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@ openondemand_clusters:
module purge
export PATH=/opt/TurboVNC/bin:$PATH
# avoid "Failed to create secure directory (/run/user/*/pulse)"
export XDG_RUNTIME_DIR="$TMPDIR/xdg_runtime"
# Workaround to avoid "Unable to contact settings server" when
# launching xfce4-session
Expand Down
Loading

0 comments on commit a415036

Please sign in to comment.