From 8c07858d175e5c5abc3213c8adba8f45ec361a1e Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 1 Nov 2023 13:57:25 +0000 Subject: [PATCH 01/48] update requirements.yml from caas@59d1b299f47404ca50283b31bf2508830052c3bf --- requirements.yml | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/requirements.yml b/requirements.yml index 3f65a27ac..1f762e2ad 100644 --- a/requirements.yml +++ b/requirements.yml @@ -6,7 +6,7 @@ roles: version: v0.23.0 # https://github.com/stackhpc/ansible-role-openhpc/pull/165 name: stackhpc.openhpc - src: https://github.com/stackhpc/ansible-node-exporter.git - version: feature/no-install + version: stackhpc name: cloudalchemy.node_exporter - src: https://github.com/cloudalchemy/ansible-prometheus.git version: 4d2c8d742de39e50387e0aa6d5510b21c7451343 # need fix in preceeding commit for rocky @@ -22,9 +22,26 @@ roles: version: v3.0.6 collections: -- name: containers.podman -- name: community.grafana -- name: https://github.com/stackhpc/ansible_collection_slurm_openstack_tools - type: git - version: v0.2.0 + - name: containers.podman + version: 1.10.2 + - name: community.grafana + version: 1.5.4 + - name: https://github.com/stackhpc/ansible_collection_slurm_openstack_tools + type: git + version: v0.2.0 + - name: ansible.posix + version: 1.5.4 + - name: ansible.netcommon + version: 5.1.1 + - name: community.general + version: 7.1.0 + - name: community.crypto + version: 2.10.0 + - name: community.mysql + version: 3.7.2 + - name: openstack.cloud + version: 2.1.0 + - name: https://github.com/stackhpc/ansible-collection-terraform + type: git + version: 1a8f5af0239de2bfedb37f51e20d973e05699b8a # main @ 20230627 ... From cef34671c11f2206899f4401e334e7cce8a2e63f Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 1 Nov 2023 14:04:15 +0000 Subject: [PATCH 02/48] add roles from caas@59d1b299f47404ca50283b31bf2508830052c3bf --- ansible/roles/cluster_infra/defaults/main.yml | 60 +++ ansible/roles/cluster_infra/tasks/main.yml | 138 ++++++ .../cluster_infra/templates/outputs.tf.j2 | 53 ++ .../cluster_infra/templates/providers.tf.j2 | 10 + .../cluster_infra/templates/resources.tf.j2 | 452 ++++++++++++++++++ ansible/roles/image_build/defaults/main.yml | 38 ++ ansible/roles/image_build/tasks/main.yml | 84 ++++ ansible/roles/image_build/tasks/prechecks.yml | 22 + .../image_build/templates/ansible.cfg.j2 | 15 + .../templates/builder.pkrvars.hcl.j2 | 38 ++ .../roles/image_build_infra/defaults/main.yml | 12 + .../roles/image_build_infra/tasks/main.yml | 45 ++ .../image_build_infra/templates/outputs.tf.j2 | 27 ++ .../templates/providers.tf.j2 | 10 + .../templates/resources.tf.j2 | 97 ++++ ansible/roles/persist_hostkeys/tasks/main.yml | 33 ++ .../persist_openhpc_secrets/tasks/main.yml | 35 ++ .../templates/openhpc_secrets.fact | 9 + ansible/roles/requirements.yml | 1 + ansible/roles/zenith_proxy/defaults/main.yml | 57 +++ .../files/podman-pod-infra-attach.sh | 17 + ansible/roles/zenith_proxy/tasks/main.yml | 103 ++++ .../zenith_proxy/templates/client.service.j2 | 33 ++ .../zenith_proxy/templates/mitm.service.j2 | 46 ++ .../zenith_proxy/templates/pod.service.j2 | 19 + .../templates/zenith-client.yaml.j2 | 27 ++ 26 files changed, 1481 insertions(+) create mode 100644 ansible/roles/cluster_infra/defaults/main.yml create mode 100644 ansible/roles/cluster_infra/tasks/main.yml create mode 100644 ansible/roles/cluster_infra/templates/outputs.tf.j2 create mode 100644 
ansible/roles/cluster_infra/templates/providers.tf.j2 create mode 100644 ansible/roles/cluster_infra/templates/resources.tf.j2 create mode 100644 ansible/roles/image_build/defaults/main.yml create mode 100644 ansible/roles/image_build/tasks/main.yml create mode 100644 ansible/roles/image_build/tasks/prechecks.yml create mode 100644 ansible/roles/image_build/templates/ansible.cfg.j2 create mode 100644 ansible/roles/image_build/templates/builder.pkrvars.hcl.j2 create mode 100644 ansible/roles/image_build_infra/defaults/main.yml create mode 100644 ansible/roles/image_build_infra/tasks/main.yml create mode 100644 ansible/roles/image_build_infra/templates/outputs.tf.j2 create mode 100644 ansible/roles/image_build_infra/templates/providers.tf.j2 create mode 100644 ansible/roles/image_build_infra/templates/resources.tf.j2 create mode 100644 ansible/roles/persist_hostkeys/tasks/main.yml create mode 100644 ansible/roles/persist_openhpc_secrets/tasks/main.yml create mode 100644 ansible/roles/persist_openhpc_secrets/templates/openhpc_secrets.fact create mode 120000 ansible/roles/requirements.yml create mode 100644 ansible/roles/zenith_proxy/defaults/main.yml create mode 100644 ansible/roles/zenith_proxy/files/podman-pod-infra-attach.sh create mode 100644 ansible/roles/zenith_proxy/tasks/main.yml create mode 100644 ansible/roles/zenith_proxy/templates/client.service.j2 create mode 100644 ansible/roles/zenith_proxy/templates/mitm.service.j2 create mode 100644 ansible/roles/zenith_proxy/templates/pod.service.j2 create mode 100644 ansible/roles/zenith_proxy/templates/zenith-client.yaml.j2 diff --git a/ansible/roles/cluster_infra/defaults/main.yml b/ansible/roles/cluster_infra/defaults/main.yml new file mode 100644 index 000000000..e22c86255 --- /dev/null +++ b/ansible/roles/cluster_infra/defaults/main.yml @@ -0,0 +1,60 @@ +##### +## WARNING +## +## The groups specified here should replicate the groups in the StackHPC Slurm appliance environments +## +## https://github.com/stackhpc/ansible-slurm-appliance/blob/main/environments/common/inventory/groups +## https://github.com/stackhpc/ansible-slurm-appliance/blob/main/environments/common/layouts/everything +##### +# These groups should represent the everything layout +cluster_groups_required: + login: ["{{ cluster_name }}_login"] + control: ["{{ cluster_name }}_control"] + compute: ["{{ cluster_name }}_compute"] + openhpc: [login, control, compute] + cluster: [openhpc] + selinux: [cluster] + nfs: [cluster] + mysql: [control] + update: [cluster] + basic_users: [cluster] + fail2ban: [login] + firewalld: [fail2ban] + # ignore these for the moment: + #etc_hosts: [] + # cloud_init: [etc_hosts] + systemd: [opensearch, grafana, control, prometheus] + +# These are the additional groups required for monitoring (see everything layout) +cluster_groups_monitoring: + podman: [opensearch, filebeat, mysql] + prometheus: [control] + grafana: [control] + alertmanager: [control] + node_exporter: [cluster] + opensearch: [control] + slurm_stats: [control] + filebeat: [slurm_stats] + +# Additional groups for OOD +cluster_groups_ood: + openondemand: [login] + openondemand_jupyter: [compute] + openondemand_desktop: [compute] + +# Additional groups for running the cluster validation +cluster_groups_validation: + hpctests: [login] + +# Additional groups for Zenith support +cluster_groups_zenith: + # Any hosts in the grafana and openondemand groups should go in the zenith group + zenith: [grafana, openondemand] + +cluster_deploy_ssh_keys_extra: [] + +# List of 
hw_scsi_models that result in block devices presenting as /dev/sdX +# rather than /dev/vdX +scsi_models: + # Ceph [https://docs.ceph.com/en/quincy/rbd/rbd-openstack/#image-properties] + - virtio-scsi diff --git a/ansible/roles/cluster_infra/tasks/main.yml b/ansible/roles/cluster_infra/tasks/main.yml new file mode 100644 index 000000000..796411fb6 --- /dev/null +++ b/ansible/roles/cluster_infra/tasks/main.yml @@ -0,0 +1,138 @@ +- debug: + msg: | + terraform_backend_type: {{ terraform_backend_type }} + terraform_state: {{ terraform_state }} + cluster_upgrade_system_packages: {{ cluster_upgrade_system_packages | default('undefined') }} + +# We need to convert the floating IP id to an address for Terraform +# if we we have cluster_floating_ip, otherwise assume that we're +# assigning the FIP in Terraform and that it will be available in +# outputs.cluster_gateway_ip. +- block: + - name: Look up floating IP + include_role: + name: stackhpc.terraform.infra + tasks_from: lookup_floating_ip + vars: + os_floating_ip_id: "{{ cluster_floating_ip }}" + + - name: Set floating IP address fact + set_fact: + cluster_floating_ip_address: "{{ os_floating_ip_info.floating_ip_address }}" + when: cluster_floating_ip is defined + +- name: Install Terraform binary + include_role: + name: stackhpc.terraform.install + +- name: Make Terraform project directory + file: + path: "{{ terraform_project_path }}" + state: directory + +- name: Write backend configuration + copy: + content: | + terraform { + backend "{{ terraform_backend_type }}" { } + } + dest: "{{ terraform_project_path }}/backend.tf" + +# Patching in this appliance is implemented as a switch to a new base image +# So unless explicitly patching, we want to use the same image as last time +# To do this, we query the previous Terraform state before updating +- block: + - name: Get previous Terraform state + stackhpc.terraform.terraform_output: + binary_path: "{{ terraform_binary_path }}" + project_path: "{{ terraform_project_path }}" + backend_config: "{{ terraform_backend_config }}" + register: cluster_infra_terraform_output + + - name: Extract image from Terraform state + set_fact: + cluster_previous_image: "{{ cluster_infra_terraform_output.outputs.cluster_image.value }}" + when: '"cluster_image" in cluster_infra_terraform_output.outputs' + when: + - terraform_state == "present" + - cluster_upgrade_system_packages is not defined or not cluster_upgrade_system_packages + +- name: Detect volume device prefix from image metadata + block: + - name: Get image metadata from OpenStack API + openstack.cloud.image_info: + image: "{{ cluster_previous_image | default(cluster_image) }}" + register: cluster_image_info + - name: Check only single image found + assert: + that: cluster_image_info.images | length == 1 + fail_msg: "Multiple images found for 'cluster_image' {{ cluster_image }}" + - name: Set volume_device_prefix fact + set_fact: + block_device_prefix: >- + {{ + 'sd' if (cluster_image_info.images | first).hw_scsi_model is defined and + (cluster_image_info.images | first).hw_scsi_model in scsi_models + else 'vd' + }} + # Only run when block_device_prefix isn't set as an extravar + when: + - block_device_prefix is not defined + - cluster_image is defined + +- name: Template Terraform files into project directory + template: + src: >- + {{ + "{}{}.j2".format( + ( + cluster_terraform_template_dir ~ "/" + if cluster_terraform_template_dir is defined + else "" + ), + item + ) + }} + dest: "{{ terraform_project_path }}/{{ item }}" + loop: + - outputs.tf + - 
providers.tf + - resources.tf + +- name: Provision infrastructure + include_role: + name: stackhpc.terraform.infra + +# The hosts provisioned by Terraform are put into a primary group by the role +# These tasks then add those hosts to additional groups depending on the selected options +- name: Add cluster hosts to required groups + add_host: + name: "{{ item }}" + groups: "{{ hostvars[item].group_names | stackhpc.terraform.terraform_infra_expand_groups(cluster_groups_required) }}" + loop: "{{ groups.get('cluster', []) }}" + +- name: Add cluster hosts to OOD groups + add_host: + name: "{{ item }}" + groups: "{{ hostvars[item].group_names | stackhpc.terraform.terraform_infra_expand_groups(cluster_groups_ood) }}" + loop: "{{ groups.get('cluster', []) }}" + +- name: Add cluster hosts to monitoring groups + add_host: + name: "{{ item }}" + groups: "{{ hostvars[item].group_names | stackhpc.terraform.terraform_infra_expand_groups(cluster_groups_monitoring) }}" + loop: "{{ groups.get('cluster', []) }}" + +- name: Add cluster hosts to validation groups + add_host: + name: "{{ item }}" + groups: "{{ hostvars[item].group_names | stackhpc.terraform.terraform_infra_expand_groups(cluster_groups_validation) }}" + loop: "{{ groups.get('cluster', []) }}" + when: cluster_run_validation | default(false) | bool + +- name: Add cluster hosts to Zenith groups + add_host: + name: "{{ item }}" + groups: "{{ hostvars[item].group_names | stackhpc.terraform.terraform_infra_expand_groups(cluster_groups_zenith) }}" + loop: "{{ groups.get('cluster', []) }}" + when: zenith_subdomain_monitoring is defined diff --git a/ansible/roles/cluster_infra/templates/outputs.tf.j2 b/ansible/roles/cluster_infra/templates/outputs.tf.j2 new file mode 100644 index 000000000..75f46717c --- /dev/null +++ b/ansible/roles/cluster_infra/templates/outputs.tf.j2 @@ -0,0 +1,53 @@ +output "cluster_gateway_ip" { + description = "The IP address of the gateway used to contact the cluster nodes" + value = openstack_compute_floatingip_associate_v2.login_floatingip_assoc.floating_ip +} + +{% if cluster_ssh_private_key_file is not defined %} +output "cluster_ssh_private_key" { + description = "The private component of the keypair generated on cluster provision" + value = openstack_compute_keypair_v2.cluster_keypair.private_key + sensitive = true +} +{% endif %} + +output "cluster_nodes" { + description = "A list of the nodes in the cluster from which an Ansible inventory will be populated" + value = concat( + [ + { + name = openstack_compute_instance_v2.login.name + ip = openstack_compute_instance_v2.login.network[0].fixed_ip_v4 + groups = ["{{ cluster_name }}_login"], + facts = { + openstack_project_id = data.openstack_identity_auth_scope_v3.scope.project_id + } + }, + { + name = openstack_compute_instance_v2.control.name + ip = openstack_compute_instance_v2.control.network[0].fixed_ip_v4 + groups = ["{{ cluster_name }}_control"], + facts = { + openstack_project_id = data.openstack_identity_auth_scope_v3.scope.project_id + } + } + ], + {% for partition in openhpc_slurm_partitions %} + [ + for compute in openstack_compute_instance_v2.{{ partition.name }}: { + name = compute.name + ip = compute.network[0].fixed_ip_v4 + groups = ["{{ cluster_name }}_compute", "{{ cluster_name }}_{{ partition.name }}"], + facts = { + openstack_project_id = data.openstack_identity_auth_scope_v3.scope.project_id + } + } + ]{{ ',' if not loop.last }} + {% endfor %} + ) +} + +output "cluster_image" { + description = "The id of the image used to build the cluster nodes" + 
value = "{{ cluster_previous_image | default(cluster_image) }}" +} diff --git a/ansible/roles/cluster_infra/templates/providers.tf.j2 b/ansible/roles/cluster_infra/templates/providers.tf.j2 new file mode 100644 index 000000000..32a16f27b --- /dev/null +++ b/ansible/roles/cluster_infra/templates/providers.tf.j2 @@ -0,0 +1,10 @@ +terraform { + required_version = ">= 0.14" + + # We need the OpenStack provider + required_providers { + openstack = { + source = "terraform-provider-openstack/openstack" + } + } +} diff --git a/ansible/roles/cluster_infra/templates/resources.tf.j2 b/ansible/roles/cluster_infra/templates/resources.tf.j2 new file mode 100644 index 000000000..1a40361e4 --- /dev/null +++ b/ansible/roles/cluster_infra/templates/resources.tf.j2 @@ -0,0 +1,452 @@ +#jinja2: trim_blocks:False +##### +##### The identity scope we are operating in +##### Used to output the OpenStack project ID as a fact for provisioned hosts +##### +data "openstack_identity_auth_scope_v3" "scope" { + name = "{{ cluster_name }}" +} + +##### +##### Security groups for the cluster +##### + +# Security group to hold common rules for the cluster +resource "openstack_networking_secgroup_v2" "secgroup_slurm_cluster" { + name = "{{ cluster_name }}-secgroup-slurm-cluster" + description = "Rules for the slurm cluster nodes" + delete_default_rules = true # Fully manage with terraform +} + +# Security group to hold specific rules for the login node +resource "openstack_networking_secgroup_v2" "secgroup_slurm_login" { + name = "{{ cluster_name }}-secgroup-slurm-login" + description = "Specific rules for the slurm login node" + delete_default_rules = true # Fully manage with terraform +} + +## Allow all egress for all cluster nodes +resource "openstack_networking_secgroup_rule_v2" "secgroup_slurm_cluster_rule_egress_v4" { + direction = "egress" + ethertype = "IPv4" + security_group_id = "${openstack_networking_secgroup_v2.secgroup_slurm_cluster.id}" +} + +## Allow all ingress between nodes in the cluster +resource "openstack_networking_secgroup_rule_v2" "secgroup_slurm_cluster_rule_ingress_internal_v4" { + direction = "ingress" + ethertype = "IPv4" + remote_group_id = "${openstack_networking_secgroup_v2.secgroup_slurm_cluster.id}" + security_group_id = "${openstack_networking_secgroup_v2.secgroup_slurm_cluster.id}" +} + +## Allow ingress on port 22 (SSH) from anywhere for the login nodes +resource "openstack_networking_secgroup_rule_v2" "secgroup_slurm_login_rule_ingress_ssh_v4" { + direction = "ingress" + ethertype = "IPv4" + protocol = "tcp" + port_range_min = 22 + port_range_max = 22 + security_group_id = "${openstack_networking_secgroup_v2.secgroup_slurm_login.id}" +} + +## Allow ingress on port 443 (HTTPS) from anywhere for the login nodes +resource "openstack_networking_secgroup_rule_v2" "secgroup_slurm_login_rule_ingress_https_v4" { + direction = "ingress" + ethertype = "IPv4" + protocol = "tcp" + port_range_min = 443 + port_range_max = 443 + security_group_id = "${openstack_networking_secgroup_v2.secgroup_slurm_login.id}" +} + +## Allow ingress on port 80 (HTTP) from anywhere for the login nodes +resource "openstack_networking_secgroup_rule_v2" "secgroup_slurm_login_rule_ingress_http_v4" { + direction = "ingress" + ethertype = "IPv4" + protocol = "tcp" + port_range_min = 80 + port_range_max = 80 + security_group_id = "${openstack_networking_secgroup_v2.secgroup_slurm_login.id}" +} + +##### +##### Volumes +##### +resource "openstack_blockstorage_volume_v3" "state" { + name = "{{ cluster_name }}-state" + description = 
"State for control node" + size = "{{ state_volume_size }}" +} + +resource "openstack_blockstorage_volume_v3" "home" { + name = "{{ cluster_name }}-home" + description = "Home for control node" + size = "{{ home_volume_size }}" + {% if use_home_volume_type_fast is defined and use_home_volume_type_fast %} + {% if home_volume_type_fast is defined %} + volume_type = "{{ home_volume_type_fast }}" + {% endif %} + {% endif %} +} + +###### +###### Cluster network +###### + +# Always get cluster_external_network network and subnet data +data "openstack_networking_network_v2" "cluster_external_network" { + name = "{{ cluster_external_network }}" +} + +data "openstack_networking_subnet_ids_v2" "cluster_external_subnets" { + network_id = "${data.openstack_networking_network_v2.cluster_external_network.id}" +} + +{% if cluster_network is not defined %} +# Create a new network +resource "openstack_networking_network_v2" "cluster_network" { + name = "{{ cluster_name }}-net" + admin_state_up = "true" +} + +resource "openstack_networking_subnet_v2" "cluster_subnet" { + name = "{{ cluster_name }}-subnet" + network_id = "${openstack_networking_network_v2.cluster_network.id}" + cidr = "{{ cluster_cidr | default('192.168.44.0/24') }}" + {% if cluster_nameservers is defined %} + dns_nameservers = [ + {% for nameserver in cluster_nameservers %} + "{{ nameserver }}"{{ ',' if not loop.last }} + {% endfor %} + ] + {% endif %} + ip_version = 4 +} + +resource "openstack_networking_router_v2" "cluster_router" { + name = "{{ cluster_name }}-router" + admin_state_up = true + external_network_id = "${data.openstack_networking_network_v2.cluster_external_network.id}" +} + +resource "openstack_networking_router_interface_v2" "cluster_router_interface" { + router_id = "${openstack_networking_router_v2.cluster_router.id}" + subnet_id = "${openstack_networking_subnet_v2.cluster_subnet.id}" +} +{% endif %} + +# Get existing network resource data by name, from either the created +# network or the network name if supplied +data "openstack_networking_network_v2" "cluster_network" { + {% if cluster_network is not defined %} + network_id = "${openstack_networking_network_v2.cluster_network.id}" + {% else %} + name = "{{ cluster_network }}" + {% endif %} +} + +data "openstack_networking_subnet_v2" "cluster_subnet" { + # Get subnet data from the subnet we create, or if it exists already + # get it from the cluster network data above + {% if cluster_network is not defined %} + subnet_id = "${openstack_networking_subnet_v2.cluster_subnet.id}" + {% else %} + network_id = "${data.openstack_networking_network_v2.cluster_network.id}" + {% endif %} +} + +##### +##### Cluster ports +##### + +resource "openstack_networking_port_v2" "login" { + name = "{{ cluster_name }}-login-0" + network_id = "${data.openstack_networking_network_v2.cluster_network.id}" + admin_state_up = "true" + + fixed_ip { + subnet_id = "${data.openstack_networking_subnet_v2.cluster_subnet.id}" + } + + security_group_ids = [ + "${openstack_networking_secgroup_v2.secgroup_slurm_cluster.id}", + "${openstack_networking_secgroup_v2.secgroup_slurm_login.id}" + ] + + binding { + vnic_type = "{{ cluster_vnic_type | default('normal') }}" + {% if cluster_vnic_profile is defined %} + profile = <- + {% if image_build_ssh_bastion_host is defined %} + '-o ProxyCommand="ssh -W %h:%p -q + {% if image_build_ssh_bastion_private_key_file is defined %} + -i {{ image_build_ssh_bastion_private_key_file }} + {% endif %} + -l {{ image_build_ssh_bastion_username }} + {{ 
image_build_ssh_bastion_host }}"' + {% else %} + "" + {% endif %} diff --git a/ansible/roles/image_build/tasks/main.yml b/ansible/roles/image_build/tasks/main.yml new file mode 100644 index 000000000..5dc9b6450 --- /dev/null +++ b/ansible/roles/image_build/tasks/main.yml @@ -0,0 +1,84 @@ +--- + +- name: Run prechecks + include_tasks: prechecks.yml + +- name: Create temporary file for pkrvars.hcl + ansible.builtin.tempfile: + state: file + suffix: .pkrvars.hcl + register: pkrvars_hcl_file + +- name: Make Packer vars file + template: + src: builder.pkrvars.hcl.j2 + dest: "{{ pkrvars_hcl_file.path }}" + +- name: Create temporary image-build inventory directory + ansible.builtin.tempfile: + state: directory + prefix: image-build. + register: image_build_inventory + +- name: Symlink "everything" layout to image-build inventory + file: + state: link + src: "{{ playbook_dir }}/vendor/stackhpc/ansible-slurm-appliance/environments/common/layouts/everything" + dest: "{{ image_build_inventory.path }}/groups" + +- name: Symlink CAAS group_vars to image-build inventory + file: + state: link + src: "{{ playbook_dir }}/group_vars" + dest: "{{ image_build_inventory.path }}/group_vars" + +- name: Add builder vars to image-build inventory hosts file + copy: + dest: "{{ image_build_inventory.path }}/hosts" + content: | + {% raw %} + localhost ansible_connection=local ansible_python_interpreter="{{ ansible_playbook_python }}" + {% endraw %} + [builder:vars] + {% if image_build_ssh_bastion_host is defined %} + ansible_ssh_common_args={{ image_build_ansible_ssh_common_args }} + {% endif %} + {% for k,v in image_build_builder_group_vars.items() -%} + {{ k }}={{ v }} + {% endfor -%} + +- name: Create temporary file for ansible.cfg + ansible.builtin.tempfile: + state: file + suffix: ansible.cfg + register: ansible_cfg_file + +- name: Template image-build ansible.cfg + template: + src: ansible.cfg.j2 + dest: "{{ ansible_cfg_file.path }}" + +- name: Packer init + command: + cmd: | + packer init . 
+ chdir: "{{ image_build_packer_root_path }}" + +- name: Build image with packer + command: + cmd: | + packer build -only openstack.openhpc -var-file={{ pkrvars_hcl_file.path }} openstack.pkr.hcl + chdir: "{{ image_build_packer_root_path }}" + environment: + APPLIANCES_ENVIRONMENT_ROOT: "{{ image_build_appliances_environment_root }}" + ANSIBLE_CONFIG: "{{ ansible_cfg_file.path }}" + PACKER_LOG: "1" + PACKER_LOG_PATH: "{{ lookup('ansible.builtin.env', 'PACKER_LOG_PATH', default='/tmp/packer-build.log') }}" + +- name: Parse packer-manifest.json + set_fact: + packer_manifest: "{{ lookup('file', '/tmp/builder.manifest.json') | from_json }}" + +- name: Extract image-build data + set_fact: + image_build_data: "{{ packer_manifest.builds | selectattr('packer_run_uuid', 'eq', packer_manifest.last_run_uuid) | first }}" diff --git a/ansible/roles/image_build/tasks/prechecks.yml b/ansible/roles/image_build/tasks/prechecks.yml new file mode 100644 index 000000000..38f1ff15e --- /dev/null +++ b/ansible/roles/image_build/tasks/prechecks.yml @@ -0,0 +1,22 @@ +--- + +- name: Check required vars are defined + assert: + that: + - "{{ item }} is defined" + fail_msg: "{{ item }} is not defined" + loop: + - image_build_network_id + - image_build_floating_ip_network + - image_build_source_image_id + - image_build_security_group_id + +- name: Ensure builder access mode + fail: + msg: >- + Set either image_build_ssh_bastion_host or + image_build_attach_floating_ip to access the image + build instance via a bastion or directly + when: + - image_build_ssh_bastion_host is defined + - image_build_attach_floating_ip is defined and image_build_attach_floating_ip diff --git a/ansible/roles/image_build/templates/ansible.cfg.j2 b/ansible/roles/image_build/templates/ansible.cfg.j2 new file mode 100644 index 000000000..acfd294ab --- /dev/null +++ b/ansible/roles/image_build/templates/ansible.cfg.j2 @@ -0,0 +1,15 @@ +[defaults] +any_errors_fatal = True +gathering = smart +host_key_checking = False +remote_tmp = /tmp +roles_path = {{ playbook_dir }}/vendor/stackhpc/ansible-slurm-appliance/ansible/roles +inventory = {{ playbook_dir }}/vendor/stackhpc/ansible-slurm-appliance/environments/common/inventory,{{ image_build_inventory.path }} + +[ssh_connection] +ssh_args = -o ControlMaster=auto -o ControlPersist=240s -o PreferredAuthentications=publickey -o UserKnownHostsFile=/dev/null +pipelining = True +# This is important because we are using one of the hosts in the play as a jump host +# This ensures that if the proxy connection is interrupted, rendering the other hosts +# unreachable, the connection is retried instead of failing the entire play +retries = 10 diff --git a/ansible/roles/image_build/templates/builder.pkrvars.hcl.j2 b/ansible/roles/image_build/templates/builder.pkrvars.hcl.j2 new file mode 100644 index 000000000..d1753225b --- /dev/null +++ b/ansible/roles/image_build/templates/builder.pkrvars.hcl.j2 @@ -0,0 +1,38 @@ +repo_root = "{{ playbook_dir }}/vendor/stackhpc/ansible-slurm-appliance" +environment_root = "{{ playbook_dir }}/image_build" +networks = ["{{ image_build_network_id }}"] +{% if image_build_ssh_username is defined %} +ssh_username = "{{ image_build_ssh_username }}" +{% endif %} +{% if image_build_ssh_bastion_host is defined %} +ssh_bastion_host = "{{ image_build_ssh_bastion_host }}" +{% endif %} +{% if image_build_ssh_bastion_username is defined %} +ssh_bastion_username = "{{ image_build_ssh_bastion_username }}" +{% endif %} +{% if image_build_ssh_bastion_private_key_file is defined %} 
+ssh_bastion_private_key_file = "{{ image_build_ssh_bastion_private_key_file }}" +{% endif %} +{% if image_build_attach_floating_ip %} +floating_ip_network = "{{ image_build_floating_ip_network }}" +{% endif %} +security_groups = ["{{ image_build_security_group_id }}"] +fatimage_source_image = "{{ image_build_source_image_id }}" +{% if image_build_ssh_keypair_name is defined %} +ssh_keypair_name = "{{ image_build_ssh_keypair_name }}" +{% endif %} +{% if image_build_ssh_private_key_file is defined %} +ssh_private_key_file = "{{ image_build_ssh_private_key_file }}" +{% endif %} +flavor = "{{ image_build_flavor_name }}" +metadata = { +{% for k,v in image_build_metadata.items() %} + "{{ k }}" = "{{ v }}" +{% endfor %} +} +use_blockstorage_volume = {{ image_build_use_blockstorage_volume | string | lower }} +{% if image_build_use_blockstorage_volume %} +volume_size = {{ image_build_volume_size }} +image_disk_format = "{{ image_build_image_disk_format }}" +{% endif %} +manifest_output_path = "/tmp/builder.manifest.json" diff --git a/ansible/roles/image_build_infra/defaults/main.yml b/ansible/roles/image_build_infra/defaults/main.yml new file mode 100644 index 000000000..adce2f827 --- /dev/null +++ b/ansible/roles/image_build_infra/defaults/main.yml @@ -0,0 +1,12 @@ +--- + +image_build_terraform_project_path: "{{ playbook_dir }}/terraform-caas-image-build" +image_build_cluster_name: "caas-image-build" + +# Regex to capture existing cloud image names to use as the +# OpenHPC Slurm base-image +image_build_existing_image_regex: "^Rocky-8-GenericCloud-Base-8.8-.*" +# Attributes to sort the list of existing base images returned by +# image_build_existing_image_regex. See +# https://registry.terraform.io/providers/terraform-provider-openstack/openstack/latest/docs/data-sources/images_image_ids_v2#sort +image_build_existing_image_sort_attributes: "name,updated_at" diff --git a/ansible/roles/image_build_infra/tasks/main.yml b/ansible/roles/image_build_infra/tasks/main.yml new file mode 100644 index 000000000..17dbc8566 --- /dev/null +++ b/ansible/roles/image_build_infra/tasks/main.yml @@ -0,0 +1,45 @@ +--- +- name: Install Terraform binary + include_role: + name: stackhpc.terraform.install + +- name: Make Terraform project directory + file: + path: "{{ image_build_terraform_project_path }}" + state: directory + +- name: Write backend configuration + copy: + content: | + terraform { + backend "{{ terraform_backend_type }}" { } + } + dest: "{{ image_build_terraform_project_path }}/backend.tf" + +- name: Template Terraform files into project directory + template: + src: "{{ item }}.j2" + dest: "{{ image_build_terraform_project_path }}/{{ item }}" + loop: + - outputs.tf + - providers.tf + - resources.tf + +- name: Provision infrastructure using Terraform + terraform: + binary_path: "{{ terraform_binary_path or omit }}" + project_path: "{{ image_build_terraform_project_path }}" + state: "{{ terraform_state }}" + backend_config: "{{ terraform_backend_config }}" + force_init: yes + init_reconfigure: yes + variables: "{{ image_build_terraform_variables | default(omit) }}" + register: image_build_terraform_provision + +- name: Set image build infrastructure facts + set_fact: + image_build_network_id: "{{ image_build_terraform_provision.outputs.network_id.value }}" + image_build_floating_ip_network: "{{ image_build_terraform_provision.outputs.floating_ip_network_id.value }}" + image_build_source_image_id: "{{ image_build_terraform_provision.outputs.source_image_name.value }}" + image_build_security_group_id: "{{ 
image_build_terraform_provision.outputs.security_group_id.value }}" + when: cluster_state is not defined or (cluster_state is defined and cluster_state != "absent") diff --git a/ansible/roles/image_build_infra/templates/outputs.tf.j2 b/ansible/roles/image_build_infra/templates/outputs.tf.j2 new file mode 100644 index 000000000..447ae9653 --- /dev/null +++ b/ansible/roles/image_build_infra/templates/outputs.tf.j2 @@ -0,0 +1,27 @@ +output "network_id" { + description = "The image build network ID" + value = data.openstack_networking_network_v2.caas_image_build_network.id +} + +output "source_image_name" { + description = "The id of the image used to build the cluster nodes" + {% if image_build_source_image_id is defined %} + value = "{{ image_build_source_image_id }}" + {% else %} + value = data.openstack_images_image_ids_v2.image_build_source_image.ids[0] + {% endif %} +} + +output "floating_ip_network_id" { + description = "Network to allocate floating IPs from" + value = data.openstack_networking_network_v2.caas_image_build_external_network.id +} + +output "security_group_id" { + description = "Security group ID to associate with the builder instance" + {% if image_build_security_group_id is defined %} + value = "{{ image_build_security_group_id }}" + {% else %} + value = openstack_networking_secgroup_v2.caas_image_build_secgroup.id + {% endif %} +} diff --git a/ansible/roles/image_build_infra/templates/providers.tf.j2 b/ansible/roles/image_build_infra/templates/providers.tf.j2 new file mode 100644 index 000000000..32a16f27b --- /dev/null +++ b/ansible/roles/image_build_infra/templates/providers.tf.j2 @@ -0,0 +1,10 @@ +terraform { + required_version = ">= 0.14" + + # We need the OpenStack provider + required_providers { + openstack = { + source = "terraform-provider-openstack/openstack" + } + } +} diff --git a/ansible/roles/image_build_infra/templates/resources.tf.j2 b/ansible/roles/image_build_infra/templates/resources.tf.j2 new file mode 100644 index 000000000..0f8233ae9 --- /dev/null +++ b/ansible/roles/image_build_infra/templates/resources.tf.j2 @@ -0,0 +1,97 @@ +#jinja2: trim_blocks:False + +###### +###### Image build network +###### + +data "openstack_networking_network_v2" "caas_image_build_external_network" { + external = true +} + +{% if image_build_network_id is not defined %} +{% if image_build_network_name is not defined %} +# Create a network +resource "openstack_networking_network_v2" "caas_image_build_network" { + name = "{{ image_build_cluster_name }}" + admin_state_up = "true" +} + +resource "openstack_networking_subnet_v2" "caas_image_build_subnet" { + name = "{{ image_build_cluster_name }}" + network_id = "${openstack_networking_network_v2.caas_image_build_network.id}" + cidr = "192.168.244.0/24" + {% if image_build_nameservers is defined %} + dns_nameservers = [ + {% for nameserver in image_build_nameservers %} + "{{ nameserver }}"{{ ',' if not loop.last }} + {% endfor %} + ] + {% endif %} + ip_version = 4 +} + +resource "openstack_networking_router_v2" "caas_image_build_router" { + name = "{{ image_build_cluster_name }}" + admin_state_up = true + external_network_id = "${data.openstack_networking_network_v2.caas_image_build_external_network.id}" +} + +resource "openstack_networking_router_interface_v2" "caas_image_build_router_interface" { + router_id = "${openstack_networking_router_v2.caas_image_build_router.id}" + subnet_id = "${openstack_networking_subnet_v2.caas_image_build_subnet.id}" +} +{% endif %} +{% endif %} + +# Get existing network resource data by name, 
from either the created +# network or the network name if supplied +data "openstack_networking_network_v2" "caas_image_build_network" { + {% if image_build_network_id is defined %} + network_id = "{{ image_build_network_id }}" + {% elif image_build_network_name is defined %} + name = "{{ image_build_network_name }}" + {% else %} + network_id = "${openstack_networking_network_v2.caas_image_build_network.id}" + {% endif %} +} + +{% if image_build_source_image_id is not defined %} +###### +###### Image build base image +###### + +data "openstack_images_image_ids_v2" "image_build_source_image" { + name_regex = "{{ image_build_existing_image_regex }}" + sort = "{{ image_build_existing_image_sort_attributes }}" +} +{% endif %} + +{% if image_build_security_group_id is not defined %} +###### +###### Image build security groups +###### + +# Security group to hold specific rules for the image build instance +resource "openstack_networking_secgroup_v2" "caas_image_build_secgroup" { + name = "{{ image_build_cluster_name }}" + description = "Specific rules for caas image build" + delete_default_rules = true # Fully manage with terraform +} + +## Allow all egress for the image build instance +resource "openstack_networking_secgroup_rule_v2" "caas_image_build_secgroup_egress_v4" { + direction = "egress" + ethertype = "IPv4" + security_group_id = "${openstack_networking_secgroup_v2.caas_image_build_secgroup.id}" +} + +## Allow ingress on port 22 (SSH) from anywhere for the image build instance +resource "openstack_networking_secgroup_rule_v2" "caas_image_build_secgroup_ingress_ssh_v4" { + direction = "ingress" + ethertype = "IPv4" + protocol = "tcp" + port_range_min = 22 + port_range_max = 22 + security_group_id = "${openstack_networking_secgroup_v2.caas_image_build_secgroup.id}" +} +{% endif %} \ No newline at end of file diff --git a/ansible/roles/persist_hostkeys/tasks/main.yml b/ansible/roles/persist_hostkeys/tasks/main.yml new file mode 100644 index 000000000..47493220d --- /dev/null +++ b/ansible/roles/persist_hostkeys/tasks/main.yml @@ -0,0 +1,33 @@ +--- + +- name: Ensure hostkeys directory exists on persistent storage + file: + path: "{{ appliances_state_dir }}/hostkeys/{{ inventory_hostname }}" + state: directory + owner: root + group: root + mode: 0600 + +- name: Copy hostkeys from persistent storage + # won't fail if no keys are in persistent storage + copy: + src: "{{ appliances_state_dir }}/hostkeys/{{ inventory_hostname }}/" + dest: /etc/ssh/ + remote_src: true + +- name: Find hostkeys + find: + path: /etc/ssh/ + patterns: ssh_host_*_key* + register: _find_ssh_keys + +- name: Persist hostkeys + copy: + dest: "{{ appliances_state_dir }}/hostkeys/{{ inventory_hostname }}/" + src: "{{ item }}" + remote_src: true + mode: preserve + loop: "{{ _find_ssh_keys.files | map(attribute='path') }}" + +- meta: reset_connection + diff --git a/ansible/roles/persist_openhpc_secrets/tasks/main.yml b/ansible/roles/persist_openhpc_secrets/tasks/main.yml new file mode 100644 index 000000000..6ae9bcd59 --- /dev/null +++ b/ansible/roles/persist_openhpc_secrets/tasks/main.yml @@ -0,0 +1,35 @@ +--- + +- name: Check if OpenHPC secrets exist in persistent storage + stat: + path: "{{ appliances_state_dir }}/ansible.facts.d/openhpc_secrets.fact" + register: openhpc_secrets_stat + +- name: Ensure Ansible facts directories exist + file: + path: "{{ item }}" + state: directory + owner: root + mode: 0600 + loop: + - "{{ appliances_state_dir }}/ansible.facts.d" + - "/etc/ansible/facts.d" + +- name: Write OpenHPC secrets + 
template: + src: openhpc_secrets.fact + dest: "{{ appliances_state_dir }}/ansible.facts.d/openhpc_secrets.fact" + owner: root + mode: 0600 + when: "not openhpc_secrets_stat.stat.exists" + +- name: Symlink persistent facts to facts_path + file: + state: link + src: "{{ appliances_state_dir }}/ansible.facts.d/openhpc_secrets.fact" + dest: /etc/ansible/facts.d/openhpc_secrets.fact + owner: root + +- name: Read facts + ansible.builtin.setup: + filter: ansible_local diff --git a/ansible/roles/persist_openhpc_secrets/templates/openhpc_secrets.fact b/ansible/roles/persist_openhpc_secrets/templates/openhpc_secrets.fact new file mode 100644 index 000000000..9d6de37d8 --- /dev/null +++ b/ansible/roles/persist_openhpc_secrets/templates/openhpc_secrets.fact @@ -0,0 +1,9 @@ +{ + "vault_azimuth_user_password": "{{ lookup('password', '/dev/null') }}", + "vault_grafana_admin_password": "{{ lookup('password', '/dev/null') }}", + "vault_elasticsearch_admin_password": "{{ lookup('password', '/dev/null') }}", + "vault_elasticsearch_kibana_password": "{{ lookup('password', '/dev/null') }}", + "vault_mysql_root_password": "{{ lookup('password', '/dev/null') }}", + "vault_mysql_slurm_password": "{{ lookup('password', '/dev/null') }}", + "vault_openhpc_mungekey": "{{ lookup('pipe', 'dd if=/dev/urandom bs=1 count=1024 2>/dev/null | base64') | regex_replace('\s+', '') }}" +} diff --git a/ansible/roles/requirements.yml b/ansible/roles/requirements.yml new file mode 120000 index 000000000..6e76d5252 --- /dev/null +++ b/ansible/roles/requirements.yml @@ -0,0 +1 @@ +../requirements.yml \ No newline at end of file diff --git a/ansible/roles/zenith_proxy/defaults/main.yml b/ansible/roles/zenith_proxy/defaults/main.yml new file mode 100644 index 000000000..70e92d648 --- /dev/null +++ b/ansible/roles/zenith_proxy/defaults/main.yml @@ -0,0 +1,57 @@ +--- + +zenith_registrar_url: "{{ undef(hint = 'zenith_registrar_url is required') }}" +zenith_registrar_verify_ssl: true +zenith_sshd_host: "{{ undef(hint = 'zenith_sshd_host is required') }}" +zenith_sshd_port: 22 + +zenith_proxy_podman_user: "{{ ansible_user }}" + +zenith_proxy_service_name: "{{ undef(hint = 'zenith_proxy_service_name is required') }}" +zenith_proxy_client_service_name: "{{ zenith_proxy_service_name }}-client" +zenith_proxy_mitm_service_name: "{{ zenith_proxy_service_name }}-mitm" + +zenith_proxy_pod_name: "{{ zenith_proxy_service_name }}" +zenith_proxy_client_container_name: "{{ zenith_proxy_client_service_name }}" +zenith_proxy_mitm_container_name: "{{ zenith_proxy_mitm_service_name }}" + +zenith_proxy_client_image_repository: ghcr.io/stackhpc/zenith-client +zenith_proxy_client_image_tag: main +zenith_proxy_client_image: "{{ zenith_proxy_client_image_repository }}:{{ zenith_proxy_client_image_tag }}" + +zenith_proxy_mitm_image_repository: ghcr.io/stackhpc/zenith-proxy +zenith_proxy_mitm_image_tag: main +zenith_proxy_mitm_image: "{{ zenith_proxy_mitm_image_repository }}:{{ zenith_proxy_mitm_image_tag }}" + +zenith_proxy_upstream_scheme: http +zenith_proxy_upstream_host: "{{ undef(hint = 'zenith_proxy_upstream_host is required') }}" +zenith_proxy_upstream_port: "{{ undef(hint = 'zenith_proxy_upstream_port is required') }}" +zenith_proxy_upstream_read_timeout: + +zenith_proxy_client_token: "{{ undef(hint = 'zenith_proxy_client_token is required') }}" +zenith_proxy_client_auth_skip: false +zenith_proxy_client_auth_params: {} + +zenith_proxy_mitm_enabled: no +zenith_proxy_mitm_listen_port: 8080 +zenith_proxy_mitm_auth_inject: none # valid values are 'basic' and 
'bearer' +zenith_proxy_mitm_auth_basic_username: >- + {{ + undef(hint = 'zenith_proxy_mitm_auth_basic_username is required') + if zenith_proxy_mitm_auth_inject == "basic" + else None + }} +zenith_proxy_mitm_auth_basic_password: >- + {{ + undef(hint = 'zenith_proxy_mitm_auth_basic_password is required') + if zenith_proxy_mitm_auth_inject == "basic" + else None + }} +zenith_proxy_mitm_auth_bearer_header_name: Authorization +zenith_proxy_mitm_auth_bearer_header_prefix: Bearer +zenith_proxy_mitm_auth_bearer_token: >- + {{ + undef(hint = 'zenith_proxy_mitm_auth_bearer_token is required') + if zenith_proxy_mitm_auth_inject == "bearer" + else None + }} diff --git a/ansible/roles/zenith_proxy/files/podman-pod-infra-attach.sh b/ansible/roles/zenith_proxy/files/podman-pod-infra-attach.sh new file mode 100644 index 000000000..aab232a0a --- /dev/null +++ b/ansible/roles/zenith_proxy/files/podman-pod-infra-attach.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash + +##### +# Small script that can be used to attach to the infra container of a pod +# +# Useful in a systemd service that starts a pod in order to track the execution +# +# Accepts a single argument which is the name of the pod whose infra container we should attach to +##### + +set -e + +echo "[INFO] Finding infra container for pod '$1'" +INFRA_CONTAINER_ID="$(podman pod inspect --format '{{.InfraContainerID}}' "$1")" + +echo "[INFO] Attaching to infra container '${INFRA_CONTAINER_ID}'" +exec podman container attach --no-stdin ${INFRA_CONTAINER_ID} diff --git a/ansible/roles/zenith_proxy/tasks/main.yml b/ansible/roles/zenith_proxy/tasks/main.yml new file mode 100644 index 000000000..1a42b0438 --- /dev/null +++ b/ansible/roles/zenith_proxy/tasks/main.yml @@ -0,0 +1,103 @@ +--- + +- name: Install script for attaching to pod infra containers + copy: + src: podman-pod-infra-attach.sh + dest: /usr/bin/ + mode: +x + become: true + +- name: Create systemd unit for Zenith pod + template: + src: pod.service.j2 + dest: /etc/systemd/system/{{ zenith_proxy_service_name }}.service + become: true + register: zenith_proxy_pod_systemd_unit + +- name: Ensure Zenith pod is started and enabled + service: + name: "{{ zenith_proxy_service_name }}.service" + state: "{{ 'restarted' if zenith_proxy_pod_systemd_unit is changed else 'started' }}" + enabled: yes + daemon_reload: "{{ zenith_proxy_pod_systemd_unit is changed }}" + become: true + +- block: + - name: Create systemd unit file for MITM proxy + template: + src: mitm.service.j2 + dest: /etc/systemd/system/{{ zenith_proxy_mitm_service_name }}.service + register: zenith_proxy_mitm_systemd_unit + + - name: Ensure MITM proxy is started and enabled + service: + name: "{{ zenith_proxy_mitm_service_name }}.service" + state: "{{ 'restarted' if zenith_proxy_mitm_systemd_unit is changed else 'started' }}" + enabled: yes + daemon_reload: "{{ zenith_proxy_mitm_systemd_unit is changed }}" + become: true + when: zenith_proxy_mitm_enabled + +- name: Ensure Zenith config directory exists + file: + path: /etc/zenith/{{ zenith_proxy_service_name }} + state: directory + become: true + +- name: Write Zenith client configuration + template: + src: zenith-client.yaml.j2 + dest: /etc/zenith/{{ zenith_proxy_service_name }}/client.yaml + become: true + register: zenith_proxy_client_config_file + +- name: Create directory to persist SSH key + file: + path: "{{ appliances_state_dir }}/{{ zenith_proxy_service_name }}-ssh" + state: directory + owner: "{{ zenith_proxy_podman_user }}" + group: "{{ zenith_proxy_podman_user }}" + become: true + +- 
name: Initialise Zenith client + # Use a foreground command rather than the podman_container module as I could not + # work out the combination of parameters that produced the desired behaviour :-( + command: >- + podman run + --name {{ zenith_proxy_service_name }}-init + --replace + --volume /etc/zenith/{{ zenith_proxy_service_name }}:/etc/zenith:ro + --volume {{ appliances_state_dir }}/{{ zenith_proxy_service_name }}-ssh:/home/zenith/.ssh + {{ zenith_proxy_client_image }} + zenith-client init + become: true + become_user: "{{ zenith_proxy_podman_user }}" + register: zenith_proxy_client_init + changed_when: zenith_proxy_client_init.rc == 0 + failed_when: >- + zenith_proxy_client_init.rc != 0 and + "token has already been used" not in zenith_proxy_client_init.stderr + +- name: Create systemd unit file for Zenith client + template: + src: client.service.j2 + dest: /etc/systemd/system/{{ zenith_proxy_client_service_name }}.service + become: true + register: zenith_proxy_client_systemd_unit + +- name: Ensure Zenith client is started and enabled + service: + name: "{{ zenith_proxy_client_service_name }}.service" + state: >- + {{ + 'restarted' + if ( + zenith_proxy_client_config_file is changed or + zenith_proxy_client_systemd_unit is changed or + zenith_proxy_client_init is changed + ) + else 'started' + }} + enabled: yes + daemon_reload: "{{ zenith_proxy_client_systemd_unit is changed }}" + become: true diff --git a/ansible/roles/zenith_proxy/templates/client.service.j2 b/ansible/roles/zenith_proxy/templates/client.service.j2 new file mode 100644 index 000000000..ba4acf0b7 --- /dev/null +++ b/ansible/roles/zenith_proxy/templates/client.service.j2 @@ -0,0 +1,33 @@ +[Unit] +Description=Podman {{ zenith_proxy_client_service_name }}.service +Wants=network.target +After=network-online.target +BindsTo={{ zenith_proxy_service_name }}.service +PartOf={{ zenith_proxy_service_name }}.service +After={{ zenith_proxy_service_name }}.service +{% if zenith_proxy_mitm_enabled %} +Wants={{ zenith_proxy_mitm_service_name }}.service +After={{ zenith_proxy_mitm_service_name }}.service +{% endif %} + +[Service] +Environment=PODMAN_SYSTEMD_UNIT=%n +Type=simple +Restart=always +User={{ zenith_proxy_podman_user }} +Group={{ zenith_proxy_podman_user }} +ExecStart=/usr/bin/podman run \ + --cgroups=no-conmon \ + --replace \ + --restart=no \ + --pod {{ zenith_proxy_pod_name }} \ + --name {{ zenith_proxy_client_container_name }} \ + --security-opt label=disable \ + --volume /etc/zenith/{{ zenith_proxy_service_name }}:/etc/zenith:ro \ + --volume {{ appliances_state_dir }}/{{ zenith_proxy_service_name }}-ssh:/home/zenith/.ssh \ + {{ zenith_proxy_client_image }} +ExecStop=/usr/bin/podman stop --ignore -t 10 {{ zenith_proxy_client_container_name }} +ExecStopPost=/usr/bin/podman rm --ignore -f {{ zenith_proxy_client_container_name }} + +[Install] +WantedBy=multi-user.target default.target diff --git a/ansible/roles/zenith_proxy/templates/mitm.service.j2 b/ansible/roles/zenith_proxy/templates/mitm.service.j2 new file mode 100644 index 000000000..d8b3c954b --- /dev/null +++ b/ansible/roles/zenith_proxy/templates/mitm.service.j2 @@ -0,0 +1,46 @@ + + +[Unit] +Description=Podman {{ zenith_proxy_mitm_service_name }}.service +Wants=network.target +After=network-online.target +BindsTo={{ zenith_proxy_service_name }}.service +PartOf={{ zenith_proxy_service_name }}.service +After={{ zenith_proxy_service_name }}.service + +[Service] +Environment=PODMAN_SYSTEMD_UNIT=%n +Type=simple +Restart=always +User={{ zenith_proxy_podman_user }} 
+Group={{ zenith_proxy_podman_user }} +ExecStart=/usr/bin/podman run \ + --cgroups=no-conmon \ + --replace \ + --restart=no \ + --pod {{ zenith_proxy_pod_name }} \ + --name {{ zenith_proxy_mitm_container_name }} \ + --security-opt label=disable \ + --env ZENITH_PROXY_LISTEN_PORT={{ zenith_proxy_mitm_listen_port }} \ + --env ZENITH_PROXY_UPSTREAM_SCHEME={{ zenith_proxy_upstream_scheme }} \ + --env ZENITH_PROXY_UPSTREAM_HOST={{ zenith_proxy_upstream_host }} \ + --env ZENITH_PROXY_UPSTREAM_PORT={{ zenith_proxy_upstream_port }} \ +{% if zenith_proxy_upstream_read_timeout %} + --env ZENITH_PROXY_READ_TIMEOUT={{ zenith_proxy_upstream_read_timeout }} \ +{% endif %} +{% if zenith_proxy_mitm_auth_inject == "basic" %} + --env ZENITH_PROXY_AUTH_INJECT=basic \ + --env ZENITH_PROXY_AUTH_BASIC_USERNAME={{ zenith_proxy_mitm_auth_basic_username }} \ + --env {{ "ZENITH_PROXY_AUTH_BASIC_PASSWORD={}".format(zenith_proxy_mitm_auth_basic_password) | quote }} \ +{% elif zenith_proxy_mitm_auth_inject == "bearer" %} + --env ZENITH_PROXY_AUTH_INJECT=bearer \ + --env ZENITH_PROXY_AUTH_BEARER_HEADER={{ zenith_proxy_mitm_auth_bearer_header_name }} \ + --env ZENITH_PROXY_AUTH_BEARER_PREFIX={{ zenith_proxy_mitm_auth_bearer_header_prefix }} \ + --env ZENITH_PROXY_AUTH_BEARER_TOKEN={{ zenith_proxy_mitm_auth_bearer_token }} \ +{% endif %} + {{ zenith_proxy_mitm_image }} +ExecStop=/usr/bin/podman stop --ignore -t 10 {{ zenith_proxy_mitm_container_name }} +ExecStopPost=/usr/bin/podman rm --ignore -f {{ zenith_proxy_mitm_container_name }} + +[Install] +WantedBy=multi-user.target default.target diff --git a/ansible/roles/zenith_proxy/templates/pod.service.j2 b/ansible/roles/zenith_proxy/templates/pod.service.j2 new file mode 100644 index 000000000..d46617556 --- /dev/null +++ b/ansible/roles/zenith_proxy/templates/pod.service.j2 @@ -0,0 +1,19 @@ +[Unit] +Description=Podman {{ zenith_proxy_service_name }}.service +Wants=network.target +After=network-online.target + +[Service] +Environment=PODMAN_SYSTEMD_UNIT=%n +Type=simple +Restart=always +User={{ zenith_proxy_podman_user }} +Group={{ zenith_proxy_podman_user }} +ExecStartPre=/usr/bin/podman pod create --replace --name {{ zenith_proxy_pod_name }} +ExecStartPre=/usr/bin/podman pod start {{ zenith_proxy_pod_name }} +ExecStart=/usr/bin/podman-pod-infra-attach.sh {{ zenith_proxy_pod_name }} +ExecStop=/usr/bin/podman pod stop --ignore -t 10 {{ zenith_proxy_pod_name }} +ExecStopPost=/usr/bin/podman pod rm --ignore -f {{ zenith_proxy_pod_name }} + +[Install] +WantedBy=multi-user.target default.target diff --git a/ansible/roles/zenith_proxy/templates/zenith-client.yaml.j2 b/ansible/roles/zenith_proxy/templates/zenith-client.yaml.j2 new file mode 100644 index 000000000..c037d7dc6 --- /dev/null +++ b/ansible/roles/zenith_proxy/templates/zenith-client.yaml.j2 @@ -0,0 +1,27 @@ +ssh_identity_path: /home/zenith/.ssh/id_zenith + +# Init options +registrar_url: {{ zenith_registrar_url }} +token: {{ zenith_proxy_client_token }} +verify_ssl: {{ 'yes' if zenith_registrar_verify_ssl else 'no' }} + +# Connect options +server_address: {{ zenith_sshd_host }} +server_port: {{ zenith_sshd_port }} +{% if zenith_proxy_mitm_enabled %} +backend_protocol: http +forward_to_host: 127.0.0.1 +forward_to_port: {{ zenith_proxy_mitm_listen_port }} +{% else %} +backend_protocol: {{ zenith_proxy_upstream_scheme }} +forward_to_host: {{ zenith_proxy_upstream_host }} +forward_to_port: {{ zenith_proxy_upstream_port }} +{% endif %} +{% if zenith_proxy_upstream_read_timeout %} +read_timeout: {{ 
zenith_proxy_upstream_read_timeout }} +{% endif %} +skip_auth: {{ 'yes' if zenith_proxy_client_auth_skip else 'no' }} +{% if zenith_proxy_client_auth_params %} +auth_params: + {{ zenith_proxy_client_auth_params | to_nice_yaml | indent(2) }} +{% endif %} From a539de406383738e56d8d035f7688feded93c826 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 1 Nov 2023 14:05:00 +0000 Subject: [PATCH 03/48] add ui-meta from caas@59d1b299f47404ca50283b31bf2508830052c3bf --- ui-meta/slurm-infra-fast-volume-type.yml | 115 +++++++++++++++++++++++ ui-meta/slurm-infra.yml | 101 ++++++++++++++++++++ 2 files changed, 216 insertions(+) create mode 100644 ui-meta/slurm-infra-fast-volume-type.yml create mode 100644 ui-meta/slurm-infra.yml diff --git a/ui-meta/slurm-infra-fast-volume-type.yml b/ui-meta/slurm-infra-fast-volume-type.yml new file mode 100644 index 000000000..899e3b439 --- /dev/null +++ b/ui-meta/slurm-infra-fast-volume-type.yml @@ -0,0 +1,115 @@ +name: "slurm" +label: "Slurm" +description: >- + Batch cluster running the Slurm workload manager, the Open + OnDemand web interface, and custom monitoring. +logo: https://upload.wikimedia.org/wikipedia/commons/thumb/3/3a/Slurm_logo.svg/158px-Slurm_logo.svg.png + +parameters: + - name: cluster_floating_ip + label: External IP + description: The external IP to use for the login node. + kind: cloud.ip + immutable: true + + - name: compute_count + label: Compute node count + description: The number of compute nodes in the cluster. + kind: integer + options: + min: 1 + default: 3 + + - name: compute_flavor + label: Compute node size + description: The size to use for the compute node. + kind: "cloud.size" + immutable: true + options: + min_ram: 2048 + min_disk: 20 + + - name: home_volume_size + label: Home volume size (GB) + description: The size of the cloud volume to use for home directories + kind: integer + immutable: true + options: + min: 10 + default: 100 + + - name: use_home_volume_type_fast + label: Provision high-performance storage for home directories + description: | + If a high-performance storage type is available to the Slurm platform, + use it for cluster home directories. If no high-performance storage type + is available, this option has no effect and a standard cloud volume will + be provisioned for home directories. + kind: boolean + required: false + default: true + options: + checkboxLabel: Put home directories on high-performance storage? + + - name: metrics_db_maximum_size + label: Metrics database size (GB) + description: | + The oldest metrics records in the [Prometheus](https://prometheus.io/) database will be + discarded to ensure that the database does not grow larger than this size. + + **A cloud volume of this size +10GB will be created to hold and persist the metrics + database and important Slurm files.** + kind: integer + immutable: true + options: + min: 10 + default: 10 + + - name: cluster_run_validation + label: Post-configuration validation + description: >- + If selected, post-configuration jobs will be executed to validate the core functionality + of the cluster when it is re-configured. + kind: boolean + required: false + default: true + options: + checkboxLabel: Run post-configuration validation? + +usage_template: |- + # Accessing the cluster using Open OnDemand + + [Open OnDemand](https://openondemand.org/) is a web portal for managing HPC jobs, including graphical + environments such as [Jupyter Notebooks](https://jupyter.org/). 
+ + {% if cluster.outputs.openondemand_url %} + The Open OnDemand portal for this cluster is available at + [{{ cluster.outputs.openondemand_url.slice(8) }}]({{ cluster.outputs.openondemand_url }}). + + Enter the username `azimuth` and password `{{ cluster.outputs.azimuth_user_password }}` when prompted. + {% else %} + The Open OnDemand portal for this cluster can be accessed from the services list. + {% endif %} + + # Accessing the cluster using SSH + + The cluster can be accessed over SSH via the external IP. The SSH public key of the user that + deployed the cluster is injected into the `azimuth` user: + + ``` + $ ssh azimuth@{{ cluster.outputs.cluster_access_ip | default('[cluster ip]') }} + [azimuth@{{ cluster.name }}-login-0 ~]$ sinfo + PARTITION AVAIL TIMELIMIT NODES STATE NODELIST + compute* up 60-00:00:0 {{ "%3s" | format(cluster.parameter_values.compute_count) }} idle {{ cluster.name }}-compute-[0-{{ cluster.parameter_values.compute_count - 1 }}] + ``` + + SSH access can be granted to additional users by placing their SSH public key in `~azimuth/.ssh/authorized_keys`. + +services: + - name: ood + label: Open OnDemand + icon_url: https://github.com/stackhpc/caas-slurm-appliance/raw/main/assets/ood-icon.png + - name: monitoring + label: Monitoring + icon_url: https://raw.githubusercontent.com/cncf/artwork/master/projects/prometheus/icon/color/prometheus-icon-color.png + diff --git a/ui-meta/slurm-infra.yml b/ui-meta/slurm-infra.yml new file mode 100644 index 000000000..ed953b926 --- /dev/null +++ b/ui-meta/slurm-infra.yml @@ -0,0 +1,101 @@ +name: "slurm" +label: "Slurm" +description: >- + Batch cluster running the Slurm workload manager, the Open + OnDemand web interface, and custom monitoring. +logo: https://upload.wikimedia.org/wikipedia/commons/thumb/3/3a/Slurm_logo.svg/158px-Slurm_logo.svg.png + +parameters: + - name: cluster_floating_ip + label: External IP + description: The external IP to use for the login node. + kind: cloud.ip + immutable: true + + - name: compute_count + label: Compute node count + description: The number of compute nodes in the cluster. + kind: integer + options: + min: 1 + default: 3 + + - name: compute_flavor + label: Compute node size + description: The size to use for the compute node. + kind: "cloud.size" + immutable: true + options: + min_ram: 2048 + min_disk: 20 + + - name: home_volume_size + label: Home volume size (GB) + description: The size of the cloud volume to use for home directories + kind: integer + immutable: true + options: + min: 10 + default: 100 + + - name: metrics_db_maximum_size + label: Metrics database size (GB) + description: | + The oldest metrics records in the [Prometheus](https://prometheus.io/) database will be + discarded to ensure that the database does not grow larger than this size. + + **A cloud volume of this size +10GB will be created to hold and persist the metrics + database and important Slurm files.** + kind: integer + immutable: true + options: + min: 10 + default: 10 + + - name: cluster_run_validation + label: Post-configuration validation + description: >- + If selected, post-configuration jobs will be executed to validate the core functionality + of the cluster when it is re-configured. + kind: boolean + required: false + default: true + options: + checkboxLabel: Run post-configuration validation? 
+ +usage_template: |- + # Accessing the cluster using Open OnDemand + + [Open OnDemand](https://openondemand.org/) is a web portal for managing HPC jobs, including graphical + environments such as [Jupyter Notebooks](https://jupyter.org/). + + {% if cluster.outputs.openondemand_url %} + The Open OnDemand portal for this cluster is available at + [{{ cluster.outputs.openondemand_url.slice(8) }}]({{ cluster.outputs.openondemand_url }}). + + Enter the username `azimuth` and password `{{ cluster.outputs.azimuth_user_password }}` when prompted. + {% else %} + The Open OnDemand portal for this cluster can be accessed from the services list. + {% endif %} + + # Accessing the cluster using SSH + + The cluster can be accessed over SSH via the external IP. The SSH public key of the user that + deployed the cluster is injected into the `azimuth` user: + + ``` + $ ssh azimuth@{{ cluster.outputs.cluster_access_ip | default('[cluster ip]') }} + [azimuth@{{ cluster.name }}-login-0 ~]$ sinfo + PARTITION AVAIL TIMELIMIT NODES STATE NODELIST + compute* up 60-00:00:0 {{ "%3s" | format(cluster.parameter_values.compute_count) }} idle {{ cluster.name }}-compute-[0-{{ cluster.parameter_values.compute_count - 1 }}] + ``` + + SSH access can be granted to additional users by placing their SSH public key in `~azimuth/.ssh/authorized_keys`. + +services: + - name: ood + label: Open OnDemand + icon_url: https://github.com/stackhpc/caas-slurm-appliance/raw/main/assets/ood-icon.png + - name: monitoring + label: Monitoring + icon_url: https://raw.githubusercontent.com/cncf/artwork/master/projects/prometheus/icon/color/prometheus-icon-color.png From 22a838f688098726f7a4375d9a4085d55f100f74 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 1 Nov 2023 14:06:18 +0000 Subject: [PATCH 04/48] add slurm-infra playbooks from caas@59d1b299f47404ca50283b31bf2508830052c3bf --- ansible/slurm-infra-fast-volume-type.yml | 140 +++++++++++++++++++++++ ansible/slurm-infra.yml | 140 +++++++++++++++++++++++ 2 files changed, 280 insertions(+) create mode 100644 ansible/slurm-infra-fast-volume-type.yml create mode 100644 ansible/slurm-infra.yml diff --git a/ansible/slurm-infra-fast-volume-type.yml b/ansible/slurm-infra-fast-volume-type.yml new file mode 100644 index 000000000..95a309a8a --- /dev/null +++ b/ansible/slurm-infra-fast-volume-type.yml @@ -0,0 +1,140 @@ +--- + +# Provision the infrastructure using Terraform +- name: Provision infrastructure + hosts: openstack + roles: + - cluster_infra + +# Setup tasks now that all hosts have been added to the correct groups +- hosts: cluster + become: yes + tasks: + # Ensure that the hosts in the cluster can all refer to each other by their hostname + - name: Populate /etc/hosts with cluster hosts + lineinfile: + path: /etc/hosts + regexp: "{{ hostvars[host].inventory_hostname }}" + line: "{{ hostvars[host].ansible_default_ipv4.address }} {{ hostvars[host].inventory_hostname }}" + loop: "{{ ansible_play_hosts }}" + loop_control: + loop_var: host + +# Ensure that the secrets are generated and persisted on the control host +- name: Generate and persist secrets + hosts: control + gather_facts: no + become: yes + roles: + - persist_openhpc_secrets + +# validate.yml asserts presence of a control group which doesn't exist when +# destroying infra, so only validate when we're not destroying +- import_playbook: vendor/stackhpc/ansible-slurm-appliance/ansible/validate.yml + when: cluster_state is not defined or (cluster_state is defined and cluster_state != "absent") + +# The first task in the 
bootstrap playbook causes the home directory of the rocky user to be moved on the first run +# This can disrupt the SSH connection, particularly because we use the login host as a jump host +# So we move the home directory on the login node and reset the connections first +- hosts: login + gather_facts: false + tasks: + - name: Set up Ansible user + user: "{{ (appliances_local_users_default | selectattr('user.name', 'eq', appliances_local_users_ansible_user_name))[0]['user'] }}" + become_method: "sudo" + # Need to change working directory otherwise we try to switch back to non-existent directory. + become_flags: '-i' + become: true + +- hosts: cluster + gather_facts: no + tasks: + - name: Reset persistent SSH connections + meta: reset_connection + +- import_playbook: vendor/stackhpc/ansible-slurm-appliance/ansible/bootstrap.yml +- import_playbook: vendor/stackhpc/ansible-slurm-appliance/ansible/filesystems.yml +- import_playbook: vendor/stackhpc/ansible-slurm-appliance/ansible/slurm.yml +- import_playbook: vendor/stackhpc/ansible-slurm-appliance/ansible/portal.yml +- import_playbook: vendor/stackhpc/ansible-slurm-appliance/ansible/iam.yml +- import_playbook: vendor/stackhpc/ansible-slurm-appliance/ansible/monitoring.yml + +- name: Persist login hostkey across rebuilds +# Need NFS for this so can't do it before the appliance plays + hosts: login + gather_facts: no + become: yes + roles: + - persist_hostkeys + +# Configure the Zenith clients that are required +# First, ensure that podman is installed on all hosts that will run Zenith clients +- hosts: zenith,!podman + tasks: + - import_role: + name: podman + tasks_from: prereqs.yml + - import_role: + name: podman + tasks_from: config.yml + +- hosts: grafana + tasks: + - name: Deploy the Zenith client for Grafana + include_role: + name: zenith_proxy + vars: + zenith_proxy_service_name: zenith-monitoring + # Use the IP address for the upstream host + zenith_proxy_upstream_host: "{{ ansible_default_ipv4.address }}" + zenith_proxy_upstream_port: "{{ grafana_port }}" + zenith_proxy_client_token: "{{ zenith_token_monitoring }}" + zenith_proxy_client_auth_params: + tenancy-id: "{{ openstack_project_id }}" + zenith_proxy_mitm_enabled: yes + zenith_proxy_mitm_auth_inject: basic + zenith_proxy_mitm_auth_basic_username: "{{ grafana_security.admin_user }}" + zenith_proxy_mitm_auth_basic_password: "{{ grafana_security.admin_password }}" + when: zenith_subdomain_monitoring is defined + +- hosts: openondemand + tasks: + - name: Deploy the Zenith client for OOD + include_role: + name: zenith_proxy + vars: + zenith_proxy_service_name: zenith-ood + # Use the IP address for the upstream host + zenith_proxy_upstream_scheme: https + zenith_proxy_upstream_host: "{{ ansible_default_ipv4.address }}" + zenith_proxy_upstream_port: 443 + zenith_proxy_client_token: "{{ zenith_token_ood }}" + zenith_proxy_client_auth_params: + tenancy-id: "{{ openstack_project_id }}" + zenith_proxy_mitm_enabled: yes + zenith_proxy_mitm_auth_inject: basic + zenith_proxy_mitm_auth_basic_username: azimuth + zenith_proxy_mitm_auth_basic_password: "{{ vault_azimuth_user_password }}" + when: zenith_subdomain_ood is defined + +- import_playbook: vendor/stackhpc/ansible-slurm-appliance/ansible/adhoc/hpctests.yml + +# Write the outputs as the final task +- hosts: localhost + tasks: + - debug: var=outputs + vars: + # Ansible has a fit when there are two 'hostvars' evaluations in a resolution chain, + # so we have to repeat logic here unfortunately + outputs: >- + {{- + { "cluster_access_ip": 
hostvars[groups['openstack'][0]].cluster_gateway_ip } | + combine( + { + "openondemand_url": "https://" ~ (hostvars[groups['openstack'][0]].cluster_gateway_ip | replace('.', '-')) ~ ".sslip.io", + "azimuth_user_password": hostvars[groups['control'][0]].ansible_local.openhpc_secrets.vault_azimuth_user_password + } + if zenith_fqdn_ood is not defined + else {} + ) + }} diff --git a/ansible/slurm-infra.yml b/ansible/slurm-infra.yml new file mode 100644 index 000000000..95a309a8a --- /dev/null +++ b/ansible/slurm-infra.yml @@ -0,0 +1,140 @@ +--- + +# Provision the infrastructure using Terraform +- name: Provision infrastructure + hosts: openstack + roles: + - cluster_infra + +# Setup tasks now that all hosts have been added to the correct groups +- hosts: cluster + become: yes + tasks: + # Ensure that the hosts in the cluster can all refer to each other by their hostname + - name: Populate /etc/hosts with cluster hosts + lineinfile: + path: /etc/hosts + regexp: "{{ hostvars[host].inventory_hostname }}" + line: "{{ hostvars[host].ansible_default_ipv4.address }} {{ hostvars[host].inventory_hostname }}" + loop: "{{ ansible_play_hosts }}" + loop_control: + loop_var: host + +# Ensure that the secrets are generated and persisted on the control host +- name: Generate and persist secrets + hosts: control + gather_facts: no + become: yes + roles: + - persist_openhpc_secrets + +# validate.yml asserts presence of a control group which doesn't exist when +# destroying infra, so only validate when we're not destroying +- import_playbook: vendor/stackhpc/ansible-slurm-appliance/ansible/validate.yml + when: cluster_state is not defined or (cluster_state is defined and cluster_state != "absent") + +# The first task in the bootstrap playbook causes the home directory of the rocky user to be moved on the first run +# This can disrupt the SSH connection, particularly because we use the login host as a jump host +# So we move the home directory on the login node and reset the connections first +- hosts: login + gather_facts: false + tasks: + - name: Set up Ansible user + user: "{{ (appliances_local_users_default | selectattr('user.name', 'eq', appliances_local_users_ansible_user_name))[0]['user'] }}" + become_method: "sudo" + # Need to change working directory otherwise we try to switch back to non-existent directory. 
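The `user:` expression in the task above selects a single entry from `appliances_local_users_default`, which is defined by the appliance rather than by this patch. A minimal sketch of the shape that expression assumes; only the nesting and the `user.name` key are implied by the expression itself, the field values are illustrative:

```yaml
# Assumed shape of appliances_local_users_default: a list of items, each carrying a 'user'
# dict that is passed to the user module once selected by matching user.name against
# appliances_local_users_ansible_user_name.
appliances_local_users_default:
  - user:
      name: rocky            # assumed Ansible user name on Rocky images
      home: /var/lib/rocky   # illustrative; the home directory move is what disrupts SSH
```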
+ become_flags: '-i' + become: true + +- hosts: cluster + gather_facts: no + tasks: + - name: Reset persistent SSH connections + meta: reset_connection + +- import_playbook: vendor/stackhpc/ansible-slurm-appliance/ansible/bootstrap.yml +- import_playbook: vendor/stackhpc/ansible-slurm-appliance/ansible/filesystems.yml +- import_playbook: vendor/stackhpc/ansible-slurm-appliance/ansible/slurm.yml +- import_playbook: vendor/stackhpc/ansible-slurm-appliance/ansible/portal.yml +- import_playbook: vendor/stackhpc/ansible-slurm-appliance/ansible/iam.yml +- import_playbook: vendor/stackhpc/ansible-slurm-appliance/ansible/monitoring.yml + +- name: Persist login hostkey across rebuilds +# Need NFS for this so can't do it before the appliance plays + hosts: login + gather_facts: no + become: yes + roles: + - persist_hostkeys + +# Configure the Zenith clients that are required +# First, ensure that podman is installed on all hosts that will run Zenith clients +- hosts: zenith,!podman + tasks: + - import_role: + name: podman + tasks_from: prereqs.yml + - import_role: + name: podman + tasks_from: config.yml + +- hosts: grafana + tasks: + - name: Deploy the Zenith client for Grafana + include_role: + name: zenith_proxy + vars: + zenith_proxy_service_name: zenith-monitoring + # Use the IP address for the upstream host + zenith_proxy_upstream_host: "{{ ansible_default_ipv4.address }}" + zenith_proxy_upstream_port: "{{ grafana_port }}" + zenith_proxy_client_token: "{{ zenith_token_monitoring }}" + zenith_proxy_client_auth_params: + tenancy-id: "{{ openstack_project_id }}" + zenith_proxy_mitm_enabled: yes + zenith_proxy_mitm_auth_inject: basic + zenith_proxy_mitm_auth_basic_username: "{{ grafana_security.admin_user }}" + zenith_proxy_mitm_auth_basic_password: "{{ grafana_security.admin_password }}" + when: zenith_subdomain_monitoring is defined + +- hosts: openondemand + tasks: + - name: Deploy the Zenith client for OOD + include_role: + name: zenith_proxy + vars: + zenith_proxy_service_name: zenith-ood + # Use the IP address for the upstream host + zenith_proxy_upstream_scheme: https + zenith_proxy_upstream_host: "{{ ansible_default_ipv4.address }}" + zenith_proxy_upstream_port: 443 + zenith_proxy_client_token: "{{ zenith_token_ood }}" + zenith_proxy_client_auth_params: + tenancy-id: "{{ openstack_project_id }}" + zenith_proxy_mitm_enabled: yes + zenith_proxy_mitm_auth_inject: basic + zenith_proxy_mitm_auth_basic_username: azimuth + zenith_proxy_mitm_auth_basic_password: "{{ vault_azimuth_user_password }}" + when: zenith_subdomain_ood is defined + +- import_playbook: vendor/stackhpc/ansible-slurm-appliance/ansible/adhoc/hpctests.yml + +# Write the outputs as the final task +- hosts: localhost + tasks: + - debug: var=outputs + vars: + # Ansible has a fit when there are two 'hostvars' evaluations in a resolution chain, + # so we have to repeat logic here unfortunately + outputs: >- + {{- + { "cluster_access_ip": hostvars[groups['openstack'][0]].cluster_gateway_ip } | + combine( + { + "openondemand_url": "https://" ~ (hostvars[groups['openstack'][0]].cluster_gateway_ip | replace('.', '-')) ~ ".sslip.io", + "azimuth_user_password": hostvars[groups['control'][0]].ansible_local.openhpc_secrets.vault_azimuth_user_password + } + if zenith_fqdn_ood is not defined + else {} + ) + }} From b08454eda0202c51846739a41979ee5bee1fce44 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 1 Nov 2023 14:12:29 +0000 Subject: [PATCH 05/48] update gitignore for caas roles --- ansible/.gitignore | 13 +++++++++++++ 1 file 
changed, 13 insertions(+) diff --git a/ansible/.gitignore b/ansible/.gitignore index 6883c6ae5..4bf56fe3b 100644 --- a/ansible/.gitignore +++ b/ansible/.gitignore @@ -44,3 +44,16 @@ roles/* !roles/resolv_conf/** !roles/cve-2023-41914 !roles/cve-2023-41914/** +!roles/cluster_infra/ +!roles/cluster_infra/** +!roles/image_build_infra/ +!roles/image_build_infra/** +!roles/persist_openhpc_secrets/ +!roles/persist_openhpc_secrets/** +!roles/zenith_proxy/ +!roles/zenith_proxy/** +!roles/image_build/ +!roles/image_build/** +!roles/persist_hostkeys/ +!roles/persist_hostkeys/** +!roles/requirements.yml From e38366fa942158655ebdd247a5c74a7f23543c42 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 1 Nov 2023 14:12:52 +0000 Subject: [PATCH 06/48] create ansible.cfg based on caas@59d1b299f47404ca50283b31bf2508830052c3bf --- ansible.cfg | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 ansible.cfg diff --git a/ansible.cfg b/ansible.cfg new file mode 100644 index 000000000..d54bf087c --- /dev/null +++ b/ansible.cfg @@ -0,0 +1,20 @@ +# Only used for Azimuth running the caas environment +[defaults] +any_errors_fatal = True +gathering = smart +forks = 30 +host_key_checking = False +inventory = environments/common/inventory,environments/.caas/inventory +remote_tmp = /tmp +collections_path = ansible/collections +roles_path = ansible/roles +filter_plugins = ansible/filter_plugins +callbacks_enabled = ansible.posix.profile_tasks + +[ssh_connection] +ssh_args = -o ControlMaster=auto -o ControlPath=~/.ssh/%r@%h-%p -o ControlPersist=240s -o PreferredAuthentications=publickey -o UserKnownHostsFile=/dev/null +pipelining = True +# This is important because we are using one of the hosts in the play as a jump host +# This ensures that if the proxy connection is interrupted, rendering the other hosts +# unreachable, the connection is retried instead of failing the entire play +retries = 10 From d174644ef84efe70075cb061616d44ba2f0eb351 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 1 Nov 2023 14:39:28 +0000 Subject: [PATCH 07/48] create .caas invironment group_vars based on caas@59d1b299f47404ca50283b31bf2508830052c3bf --- environments/.caas/README.md | 5 ++ environments/.caas/activate | 23 ++++++ environments/.caas/ansible.cfg | 15 ++++ environments/.caas/cloud_init/.gitkeep | 0 environments/.caas/hooks/.gitkeep | 0 .../.caas/inventory/group_vars/all/.gitkeep | 0 .../inventory/group_vars/all/cluster.yml | 21 ++++++ .../inventory/group_vars/all/grafana.yml | 1 + .../inventory/group_vars/all/hpctests.yml | 6 ++ .../.caas/inventory/group_vars/all/nfs.yml | 17 +++++ .../inventory/group_vars/all/openhpc.yml | 5 ++ .../inventory/group_vars/all/openondemand.yml | 75 +++++++++++++++++++ .../inventory/group_vars/all/prometheus.yml | 11 +++ .../inventory/group_vars/all/selinux.yml | 1 + .../.caas/inventory/group_vars/all/zenith.yml | 1 + .../.caas/inventory/group_vars/openstack.yml | 29 +++++++ environments/.caas/inventory/groups | 68 +++++++++++++++++ 17 files changed, 278 insertions(+) create mode 100644 environments/.caas/README.md create mode 100644 environments/.caas/activate create mode 100644 environments/.caas/ansible.cfg create mode 100644 environments/.caas/cloud_init/.gitkeep create mode 100644 environments/.caas/hooks/.gitkeep create mode 100644 environments/.caas/inventory/group_vars/all/.gitkeep create mode 100644 environments/.caas/inventory/group_vars/all/cluster.yml create mode 100644 environments/.caas/inventory/group_vars/all/grafana.yml create mode 100644 
environments/.caas/inventory/group_vars/all/hpctests.yml create mode 100644 environments/.caas/inventory/group_vars/all/nfs.yml create mode 100644 environments/.caas/inventory/group_vars/all/openhpc.yml create mode 100644 environments/.caas/inventory/group_vars/all/openondemand.yml create mode 100644 environments/.caas/inventory/group_vars/all/prometheus.yml create mode 100644 environments/.caas/inventory/group_vars/all/selinux.yml create mode 100644 environments/.caas/inventory/group_vars/all/zenith.yml create mode 100644 environments/.caas/inventory/group_vars/openstack.yml create mode 100644 environments/.caas/inventory/groups diff --git a/environments/.caas/README.md b/environments/.caas/README.md new file mode 100644 index 000000000..972640bae --- /dev/null +++ b/environments/.caas/README.md @@ -0,0 +1,5 @@ +# Caas cluster + +Default Azimuth Slurm + +See the main README.md in the repo root for an overview and general install instructions. Any environment-specific instructions should be added here. \ No newline at end of file diff --git a/environments/.caas/activate b/environments/.caas/activate new file mode 100644 index 000000000..e74031095 --- /dev/null +++ b/environments/.caas/activate @@ -0,0 +1,23 @@ +export APPLIANCES_ENVIRONMENT_ROOT=$(dirname $(realpath ${BASH_SOURCE[0]:-${(%):-%x}})) +echo "Setting APPLIANCES_ENVIRONMENT_ROOT to $APPLIANCES_ENVIRONMENT_ROOT" + +APPLIANCES_ENVIRONMENT_NAME=$(basename $APPLIANCES_ENVIRONMENT_ROOT) +export PS1="${APPLIANCES_ENVIRONMENT_NAME}/ ${PS1}" + +export APPLIANCES_REPO_ROOT=$(realpath "$APPLIANCES_ENVIRONMENT_ROOT/../..") +echo "Setting APPLIANCES_REPO_ROOT to $APPLIANCES_REPO_ROOT" + +export TF_VAR_environment_root=$(realpath "$APPLIANCES_ENVIRONMENT_ROOT") +echo "Setting TF_VAR_environment_root to $TF_VAR_environment_root" + +export PKR_VAR_environment_root=$(realpath "$APPLIANCES_ENVIRONMENT_ROOT") +echo "Setting PKR_VAR_environment_root to $PKR_VAR_environment_root" + +export PKR_VAR_repo_root=$(realpath "$APPLIANCES_REPO_ROOT") +echo "Setting PKR_VAR_repo_root to $PKR_VAR_repo_root" + +if [ -f "$APPLIANCES_ENVIRONMENT_ROOT/ansible.cfg" ]; then + export ANSIBLE_CONFIG=$APPLIANCES_ENVIRONMENT_ROOT/ansible.cfg +fi + + diff --git a/environments/.caas/ansible.cfg b/environments/.caas/ansible.cfg new file mode 100644 index 000000000..54a1c2a50 --- /dev/null +++ b/environments/.caas/ansible.cfg @@ -0,0 +1,15 @@ +[defaults] +any_errors_fatal = True +stdout_callback = debug +stderr_callback = debug +gathering = smart +forks = 30 +host_key_checking = False +inventory = ../common/inventory,inventory +collections_path = ../../ansible/collections +roles_path = ../../ansible/roles +filter_plugins = ../../ansible/filter_plugins + +[ssh_connection] +ssh_args = -o ControlMaster=auto ControlPath=~/.ssh/%r@%h-%p -o ControlPersist=240s -o PreferredAuthentications=publickey -o UserKnownHostsFile=/dev/null +pipelining = True diff --git a/environments/.caas/cloud_init/.gitkeep b/environments/.caas/cloud_init/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/environments/.caas/hooks/.gitkeep b/environments/.caas/hooks/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/environments/.caas/inventory/group_vars/all/.gitkeep b/environments/.caas/inventory/group_vars/all/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/environments/.caas/inventory/group_vars/all/cluster.yml b/environments/.caas/inventory/group_vars/all/cluster.yml new file mode 100644 index 000000000..f416c2782 --- /dev/null +++ 
b/environments/.caas/inventory/group_vars/all/cluster.yml @@ -0,0 +1,21 @@ +# Account for the fact we are running outside of the expected environment system: +# NB: this only works for playbooks in ansible/*, not in ansible/adhoc! +appliances_repository_root: "{{ playbook_dir }}/../" + +# Read the secrets from the Ansible local facts on the control host +vault_azimuth_user_password: "{{ hostvars[groups['control'][0]].ansible_local.openhpc_secrets.vault_azimuth_user_password }}" +vault_grafana_admin_password: "{{ hostvars[groups['control'][0]].ansible_local.openhpc_secrets.vault_grafana_admin_password }}" +vault_elasticsearch_admin_password: "{{ hostvars[groups['control'][0]].ansible_local.openhpc_secrets.vault_elasticsearch_admin_password }}" +vault_elasticsearch_kibana_password: "{{ hostvars[groups['control'][0]].ansible_local.openhpc_secrets.vault_elasticsearch_kibana_password }}" +vault_mysql_root_password: "{{ hostvars[groups['control'][0]].ansible_local.openhpc_secrets.vault_mysql_root_password }}" +vault_mysql_slurm_password: "{{ hostvars[groups['control'][0]].ansible_local.openhpc_secrets.vault_mysql_slurm_password }}" +vault_openhpc_mungekey: "{{ hostvars[groups['control'][0]].ansible_local.openhpc_secrets.vault_openhpc_mungekey }}" + +# Override this to cope with the case where the podman group just doesn't exist +appliances_local_users_podman_enable: "{{ groups.get('podman', []) | length > 0 }}" + +# The server name for Open OnDemand depends on whether Zenith is enabled or not +openondemand_servername_default: "{{ hostvars[groups['openstack'][0]].cluster_gateway_ip | replace('.', '-') ~ '.sslip.io' }}" +openondemand_servername: "{{ zenith_fqdn_ood | default(openondemand_servername_default) }}" + +appliances_state_dir: /var/lib/state diff --git a/environments/.caas/inventory/group_vars/all/grafana.yml b/environments/.caas/inventory/group_vars/all/grafana.yml new file mode 100644 index 000000000..10fdc926c --- /dev/null +++ b/environments/.caas/inventory/group_vars/all/grafana.yml @@ -0,0 +1 @@ +grafana_auth_anonymous: "{{ groups['openondemand'] | count > 0 }}" diff --git a/environments/.caas/inventory/group_vars/all/hpctests.yml b/environments/.caas/inventory/group_vars/all/hpctests.yml new file mode 100644 index 000000000..6e3a0cbae --- /dev/null +++ b/environments/.caas/inventory/group_vars/all/hpctests.yml @@ -0,0 +1,6 @@ +# Skip plotting pingpong as matplotlib not in runner environment +hpctests_pingpong_plot: false + +# In CaaS, the Ansible controller is an ephemeral AWX pod, so all that matters is that +# this is a location that is writable by the container user +hpctests_outdir: "{{ playbook_dir }}/.tmp/hpctests" diff --git a/environments/.caas/inventory/group_vars/all/nfs.yml b/environments/.caas/inventory/group_vars/all/nfs.yml new file mode 100644 index 000000000..b21cae962 --- /dev/null +++ b/environments/.caas/inventory/group_vars/all/nfs.yml @@ -0,0 +1,17 @@ +# Use the IP address instead +nfs_server: "{{ hostvars[groups['control'] | first ].ansible_default_ipv4.address }}" + +nfs_configurations: + - comment: Export /exports/home from Slurm control node as /home + nfs_enable: + server: "{{ inventory_hostname in groups['control'] }}" + clients: "{{ inventory_hostname in groups['cluster'] and inventory_hostname not in groups['control'] }}" + nfs_export: "/exports/home" # assumes skeleton TF is being used + nfs_client_mnt_point: "/home" + - comment: Export /var/lib/state from Slurm control node to OOD + nfs_enable: + server: "{{ inventory_hostname in groups['control'] }}" 
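The `vault_*` variables in cluster.yml above all read from `ansible_local.openhpc_secrets` on the control host, i.e. from the custom fact written by the persist_openhpc_secrets role. A sketch of the local-fact structure those lookups assume; the key names are taken from the variables above, the values are generated secrets:

```yaml
# Assumed local facts on the control host after persist_openhpc_secrets has run
ansible_local:
  openhpc_secrets:
    vault_azimuth_user_password: "<generated secret>"
    vault_grafana_admin_password: "<generated secret>"
    vault_elasticsearch_admin_password: "<generated secret>"
    vault_elasticsearch_kibana_password: "<generated secret>"
    vault_mysql_root_password: "<generated secret>"
    vault_mysql_slurm_password: "<generated secret>"
    vault_openhpc_mungekey: "<generated secret>"
```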
+ clients: "{{ inventory_hostname in groups['openondemand'] }}" + nfs_export: "{{ appliances_state_dir }}" + nfs_client_mnt_point: "{{ appliances_state_dir }}" + nfs_client_mnt_options: "x-systemd.required-by=zenith-ood.service,x-systemd.before=zenith-ood.service" diff --git a/environments/.caas/inventory/group_vars/all/openhpc.yml b/environments/.caas/inventory/group_vars/all/openhpc.yml new file mode 100644 index 000000000..624402f9f --- /dev/null +++ b/environments/.caas/inventory/group_vars/all/openhpc.yml @@ -0,0 +1,5 @@ +openhpc_cluster_name: "{{ cluster_name }}" + +# Provision a single "standard" compute partition using the supplied +# node count and flavor +openhpc_slurm_partitions: "{{ hostvars[groups['openstack'][0]]['openhpc_slurm_partitions'] }}" diff --git a/environments/.caas/inventory/group_vars/all/openondemand.yml b/environments/.caas/inventory/group_vars/all/openondemand.yml new file mode 100644 index 000000000..9ee960417 --- /dev/null +++ b/environments/.caas/inventory/group_vars/all/openondemand.yml @@ -0,0 +1,75 @@ +--- +openondemand_auth: basic_pam +openondemand_jupyter_partition: "{{ openhpc_slurm_partitions[0]['name'] }}" +openondemand_desktop_partition: "{{ openhpc_slurm_partitions[0]['name'] }}" + +httpd_listen_addr_port: + - 80 + - 443 + +# Allow proxying to compute nodes for apps and control for monitoring only when the grafana group is available +openondemand_host_regex: "{{ (groups['compute'] + groups['grafana']) | to_ood_regex }}" + +# Add grafana to dashboard links to OOD only if grafana group is available +openondemand_dashboard_links_grafana: + - name: Grafana + app_name: grafana + category: Monitoring + description: Dashboards + url: "{{ grafana_url_openondemand_proxy }}" +openondemand_dashboard_links: "{{ openondemand_dashboard_links_grafana if 'grafana' in groups else [] }}" + +# Add grafana panel to jobs page only if grafana group is available +openondemand_clusters: + slurm: + v2: + metadata: + title: "{{ openhpc_cluster_name }}" # interpolation here works as openondemand is lexically after openhpc + login: + host: "{{ hostvars[groups['login'].0].api_address }}" + default: true + job: + adapter: slurm + cluster: "{{ openhpc_cluster_name }}" + batch_connect: + basic: + script_wrapper: |- + module purge + export PATH=/opt/jupyter/bin/:$PATH + %s + set_host: host=$(hostname -s) + vnc: + script_wrapper: |- + module purge + export PATH=/opt/TurboVNC/bin:$PATH + # Workaround to avoid "Unable to contact settings server" when + # lauching xfce4-session + xfce4-session() { /bin/dbus-launch /bin/xfce4-session $@ ; } + export -f xfce4-session + %s + set_host: host=$(hostname -s) + custom: "{{ openondemand_clusters_grafana if 'grafana' in groups else {} }}" + +grafana_address: "{{ hostvars[groups['grafana'][0]]['api_address'] if 'grafana' in groups else '' }}" +grafana_url_openondemand_proxy: "https://{{ openondemand_servername }}/node/{{ groups['grafana'][0] if 'grafana' in groups else '' }}/{{ grafana_port }}" + +openondemand_clusters_grafana: + # embed grafana panels in Jobs app: https://osc.github.io/ood-documentation/latest/customization.html#grafana-support + grafana: + host: "{{ grafana_url_openondemand_proxy if 'openondemand' in groups else grafana_url_direct }}" + orgId: 1 + dashboard: + name: "node-exporter-slurm" + uid: "node-exporter-slurm" + panels: + cpu: 77 + memory: 78 + labels: + cluster: "cluster" + host: "host" + jobid: "jobid" + +_opeonondemand_unset_auth: ' RequestHeader unset Authorization' + +# Fix grafana proxying for basic auth if 
anonymous grafana access enabled: +openondemand_node_proxy_directives: "{{ _opeonondemand_unset_auth if (openondemand_auth == 'basic_pam' and 'openondemand_host_regex' and 'grafana' in groups and hostvars[groups['grafana'][0]]._grafana_auth_is_anonymous) else '' }}" diff --git a/environments/.caas/inventory/group_vars/all/prometheus.yml b/environments/.caas/inventory/group_vars/all/prometheus.yml new file mode 100644 index 000000000..3ea282893 --- /dev/null +++ b/environments/.caas/inventory/group_vars/all/prometheus.yml @@ -0,0 +1,11 @@ +--- +# Override openondemand_address because its needed in openondemand_scrape_configs +# which is used in prometheus_scrape_configs +openondemand_address: "{{ hostvars[groups['openondemand'].0].api_address if 'openondemand' in groups else '' }}" + +# Override group_var set in ansible-slurm-appliance all group - unless +# OOD is being deployed then there won't be an OOD group +prometheus_scrape_configs: "{{ prometheus_scrape_configs_default + (openondemand_scrape_configs if ( 'openondemand' in groups ) else [] ) }}" + +# Set Prometheus storage retention size +prometheus_storage_retention_size: "{{ metrics_db_maximum_size }}GB" \ No newline at end of file diff --git a/environments/.caas/inventory/group_vars/all/selinux.yml b/environments/.caas/inventory/group_vars/all/selinux.yml new file mode 100644 index 000000000..1f1098126 --- /dev/null +++ b/environments/.caas/inventory/group_vars/all/selinux.yml @@ -0,0 +1 @@ +selinux_state: disabled \ No newline at end of file diff --git a/environments/.caas/inventory/group_vars/all/zenith.yml b/environments/.caas/inventory/group_vars/all/zenith.yml new file mode 100644 index 000000000..56dd0ca16 --- /dev/null +++ b/environments/.caas/inventory/group_vars/all/zenith.yml @@ -0,0 +1 @@ +zenith_proxy_podman_user: podman diff --git a/environments/.caas/inventory/group_vars/openstack.yml b/environments/.caas/inventory/group_vars/openstack.yml new file mode 100644 index 000000000..e8a99007c --- /dev/null +++ b/environments/.caas/inventory/group_vars/openstack.yml @@ -0,0 +1,29 @@ +# The default Terraform state key for backends that support it +terraform_state_key: "cluster/{{ cluster_id }}/tfstate" + +# Set up the terraform backend +terraform_backend_type: "{{ 'consul' if 'CONSUL_HTTP_ADDR' in ansible_env else 'local' }}" +terraform_backend_config_defaults: + consul: + path: "{{ terraform_state_key }}" + gzip: "true" + local: {} +terraform_backend_config: "{{ terraform_backend_config_defaults[terraform_backend_type] }}" + +terraform_binary_directory: "{{ playbook_dir }}/bin" +terraform_binary_path: "{{ terraform_binary_directory }}/terraform" +terraform_project_path: "{{ playbook_dir }}/terraform" + +terraform_state: "{{ cluster_state | default('present') }}" +cluster_ssh_user: rocky + +# Set the size of the state volume to metrics_db_maximum_size + 10 +state_volume_size: "{{ metrics_db_maximum_size + 10 }}" + +# Provision a single "standard" compute partition using the supplied +# node count and flavor +openhpc_slurm_partitions: + - name: "standard" + count: "{{ compute_count }}" + flavor: "{{ compute_flavor }}" + default: "YES" diff --git a/environments/.caas/inventory/groups b/environments/.caas/inventory/groups new file mode 100644 index 000000000..84e6e5a72 --- /dev/null +++ b/environments/.caas/inventory/groups @@ -0,0 +1,68 @@ +[nfs:children] +openhpc + +[mysql:children] +control + +[prometheus:children] +control + +[grafana:children] +control + +[alertmanager:children] +control + +[node_exporter:children] +cluster 
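The stanzas in this groups file only declare parent/child relationships between groups; the concrete hosts for leaf groups such as login, control and compute are expected to be added by the provisioning step. For orientation, a hypothetical provisioned inventory is sketched below in Ansible's YAML inventory form (host names and addresses are placeholders; the appliance itself works with INI-style files):

```yaml
# Hypothetical provisioned hosts that the group definitions in this file layer onto
all:
  children:
    login:
      hosts:
        demo-login-0: {ansible_host: 192.0.2.10}
    control:
      hosts:
        demo-control-0: {ansible_host: 10.0.0.5}
    compute:
      hosts:
        demo-compute-0: {ansible_host: 10.0.0.11}
        demo-compute-1: {ansible_host: 10.0.0.12}
```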
+ +[opensearch:children] +control + +[slurm_stats:children] +control + +[filebeat:children] +slurm_stats + +# NB: [rebuild] not defined here as this template is used in CI, which does not run in openstack + +[update:children] +cluster + +[fail2ban:children] +# Hosts to install fail2ban on to protect SSH +login + +[block_devices:children] +# Environment-specific so not defined here + +[basic_users] +# Add `openhpc` group to add Slurm users via creation of users on each node. + +[openondemand:children] +# Host to run Open Ondemand server on - subset of login +login + +[openondemand_desktop:children] +# Subset of compute to run a interactive desktops on via Open Ondemand +compute + +[openondemand_jupyter:children] +# Subset of compute to run a Jupyter Notebook servers on via Open Ondemand +compute + +[etc_hosts] +# Hosts to manage /etc/hosts e.g. if no internal DNS. See ansible/roles/etc_hosts/README.md + +[cuda] +# Hosts to install NVIDIA CUDA on - see ansible/roles/cuda/README.md + +[eessi:children] +openhpc + +[resolv_conf] +# Allows defining nameservers in /etc/resolv.conf - see ansible/roles/resolv_conf/README.md + +[proxy] +# Hosts to configure http/s proxies - see ansible/roles/proxy/README.md From db7f6de2b42fe64175f83484ce4a5b70092ffc74 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 1 Nov 2023 14:42:32 +0000 Subject: [PATCH 08/48] remove requirements.yml symlink --- ansible/roles/requirements.yml | 1 - 1 file changed, 1 deletion(-) delete mode 120000 ansible/roles/requirements.yml diff --git a/ansible/roles/requirements.yml b/ansible/roles/requirements.yml deleted file mode 120000 index 6e76d5252..000000000 --- a/ansible/roles/requirements.yml +++ /dev/null @@ -1 +0,0 @@ -../requirements.yml \ No newline at end of file From 105c97f6a65b5d3e36d0c119b52155e07d31bbd8 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 7 Nov 2023 12:22:01 +0000 Subject: [PATCH 09/48] debug inventory --- ansible/slurm-infra.yml | 261 +++++++++++++++++++++------------------- 1 file changed, 135 insertions(+), 126 deletions(-) diff --git a/ansible/slurm-infra.yml b/ansible/slurm-infra.yml index 95a309a8a..a1e541014 100644 --- a/ansible/slurm-infra.yml +++ b/ansible/slurm-infra.yml @@ -1,140 +1,149 @@ --- -# Provision the infrastructure using Terraform -- name: Provision infrastructure - hosts: openstack - roles: - - cluster_infra - -# Setup tasks now that all hosts have been added to the correct groups -- hosts: cluster - become: yes +- hosts: localhost + become: no + gather_facts: no tasks: - # Ensure that the hosts in the cluster can all refer to each other by their hostname - - name: Populate /etc/hosts with cluster hosts - lineinfile: - path: /etc/hosts - regexp: "{{ hostvars[host].inventory_hostname }}" - line: "{{ hostvars[host].ansible_default_ipv4.address }} {{ hostvars[host].inventory_hostname }}" - loop: "{{ ansible_play_hosts }}" - loop_control: - loop_var: host + - debug: + msg: | + ansible_inventory_sources: {{ ansible_inventory_sources }} + ansible_config_file: {{ ansible_config_file }} -# Ensure that the secrets are generated and persisted on the control host -- name: Generate and persist secrets - hosts: control - gather_facts: no - become: yes - roles: - - persist_openhpc_secrets +# # Provision the infrastructure using Terraform +# - name: Provision infrastructure +# hosts: openstack +# roles: +# - cluster_infra -# validate.yml asserts presence of a control group which doesn't exist when -# destroying infra, so only validate when we're not destroying -- import_playbook: 
vendor/stackhpc/ansible-slurm-appliance/ansible/validate.yml - when: cluster_state is not defined or (cluster_state is defined and cluster_state != "absent") +# # Setup tasks now that all hosts have been added to the correct groups +# - hosts: cluster +# become: yes +# tasks: +# # Ensure that the hosts in the cluster can all refer to each other by their hostname +# - name: Populate /etc/hosts with cluster hosts +# lineinfile: +# path: /etc/hosts +# regexp: "{{ hostvars[host].inventory_hostname }}" +# line: "{{ hostvars[host].ansible_default_ipv4.address }} {{ hostvars[host].inventory_hostname }}" +# loop: "{{ ansible_play_hosts }}" +# loop_control: +# loop_var: host -# The first task in the bootstrap playbook causes the home directory of the rocky user to be moved on the first run -# This can disrupt the SSH connection, particularly because we use the login host as a jump host -# So we move the home directory on the login node and reset the connections first -- hosts: login - gather_facts: false - tasks: - - name: Set up Ansible user - user: "{{ (appliances_local_users_default | selectattr('user.name', 'eq', appliances_local_users_ansible_user_name))[0]['user'] }}" - become_method: "sudo" - # Need to change working directory otherwise we try to switch back to non-existent directory. - become_flags: '-i' - become: true +# # Ensure that the secrets are generated and persisted on the control host +# - name: Generate and persist secrets +# hosts: control +# gather_facts: no +# become: yes +# roles: +# - persist_openhpc_secrets -- hosts: cluster - gather_facts: no - tasks: - - name: Reset persistent SSH connections - meta: reset_connection +# # validate.yml asserts presence of a control group which doesn't exist when +# # destroying infra, so only validate when we're not destroying +# - import_playbook: vendor/stackhpc/ansible-slurm-appliance/ansible/validate.yml +# when: cluster_state is not defined or (cluster_state is defined and cluster_state != "absent") -- import_playbook: vendor/stackhpc/ansible-slurm-appliance/ansible/bootstrap.yml -- import_playbook: vendor/stackhpc/ansible-slurm-appliance/ansible/filesystems.yml -- import_playbook: vendor/stackhpc/ansible-slurm-appliance/ansible/slurm.yml -- import_playbook: vendor/stackhpc/ansible-slurm-appliance/ansible/portal.yml -- import_playbook: vendor/stackhpc/ansible-slurm-appliance/ansible/iam.yml -- import_playbook: vendor/stackhpc/ansible-slurm-appliance/ansible/monitoring.yml +# # The first task in the bootstrap playbook causes the home directory of the rocky user to be moved on the first run +# # This can disrupt the SSH connection, particularly because we use the login host as a jump host +# # So we move the home directory on the login node and reset the connections first +# - hosts: login +# gather_facts: false +# tasks: +# - name: Set up Ansible user +# user: "{{ (appliances_local_users_default | selectattr('user.name', 'eq', appliances_local_users_ansible_user_name))[0]['user'] }}" +# become_method: "sudo" +# # Need to change working directory otherwise we try to switch back to non-existent directory. 
+# become_flags: '-i' +# become: true -- name: Persist login hostkey across rebuilds -# Need NFS for this so can't do it before the appliance plays - hosts: login - gather_facts: no - become: yes - roles: - - persist_hostkeys +# - hosts: cluster +# gather_facts: no +# tasks: +# - name: Reset persistent SSH connections +# meta: reset_connection -# Configure the Zenith clients that are required -# First, ensure that podman is installed on all hosts that will run Zenith clients -- hosts: zenith,!podman - tasks: - - import_role: - name: podman - tasks_from: prereqs.yml - - import_role: - name: podman - tasks_from: config.yml +# - import_playbook: vendor/stackhpc/ansible-slurm-appliance/ansible/bootstrap.yml +# - import_playbook: vendor/stackhpc/ansible-slurm-appliance/ansible/filesystems.yml +# - import_playbook: vendor/stackhpc/ansible-slurm-appliance/ansible/slurm.yml +# - import_playbook: vendor/stackhpc/ansible-slurm-appliance/ansible/portal.yml +# - import_playbook: vendor/stackhpc/ansible-slurm-appliance/ansible/iam.yml +# - import_playbook: vendor/stackhpc/ansible-slurm-appliance/ansible/monitoring.yml -- hosts: grafana - tasks: - - name: Deploy the Zenith client for Grafana - include_role: - name: zenith_proxy - vars: - zenith_proxy_service_name: zenith-monitoring - # Use the IP address for the upstream host - zenith_proxy_upstream_host: "{{ ansible_default_ipv4.address }}" - zenith_proxy_upstream_port: "{{ grafana_port }}" - zenith_proxy_client_token: "{{ zenith_token_monitoring }}" - zenith_proxy_client_auth_params: - tenancy-id: "{{ openstack_project_id }}" - zenith_proxy_mitm_enabled: yes - zenith_proxy_mitm_auth_inject: basic - zenith_proxy_mitm_auth_basic_username: "{{ grafana_security.admin_user }}" - zenith_proxy_mitm_auth_basic_password: "{{ grafana_security.admin_password }}" - when: zenith_subdomain_monitoring is defined +# - name: Persist login hostkey across rebuilds +# # Need NFS for this so can't do it before the appliance plays +# hosts: login +# gather_facts: no +# become: yes +# roles: +# - persist_hostkeys -- hosts: openondemand - tasks: - - name: Deploy the Zenith client for OOD - include_role: - name: zenith_proxy - vars: - zenith_proxy_service_name: zenith-ood - # Use the IP address for the upstream host - zenith_proxy_upstream_scheme: https - zenith_proxy_upstream_host: "{{ ansible_default_ipv4.address }}" - zenith_proxy_upstream_port: 443 - zenith_proxy_client_token: "{{ zenith_token_ood }}" - zenith_proxy_client_auth_params: - tenancy-id: "{{ openstack_project_id }}" - zenith_proxy_mitm_enabled: yes - zenith_proxy_mitm_auth_inject: basic - zenith_proxy_mitm_auth_basic_username: azimuth - zenith_proxy_mitm_auth_basic_password: "{{ vault_azimuth_user_password }}" - when: zenith_subdomain_ood is defined +# # Configure the Zenith clients that are required +# # First, ensure that podman is installed on all hosts that will run Zenith clients +# - hosts: zenith,!podman +# tasks: +# - import_role: +# name: podman +# tasks_from: prereqs.yml +# - import_role: +# name: podman +# tasks_from: config.yml -- import_playbook: vendor/stackhpc/ansible-slurm-appliance/ansible/adhoc/hpctests.yml +# - hosts: grafana +# tasks: +# - name: Deploy the Zenith client for Grafana +# include_role: +# name: zenith_proxy +# vars: +# zenith_proxy_service_name: zenith-monitoring +# # Use the IP address for the upstream host +# zenith_proxy_upstream_host: "{{ ansible_default_ipv4.address }}" +# zenith_proxy_upstream_port: "{{ grafana_port }}" +# zenith_proxy_client_token: "{{ 
zenith_token_monitoring }}" +# zenith_proxy_client_auth_params: +# tenancy-id: "{{ openstack_project_id }}" +# zenith_proxy_mitm_enabled: yes +# zenith_proxy_mitm_auth_inject: basic +# zenith_proxy_mitm_auth_basic_username: "{{ grafana_security.admin_user }}" +# zenith_proxy_mitm_auth_basic_password: "{{ grafana_security.admin_password }}" +# when: zenith_subdomain_monitoring is defined -# Write the outputs as the final task -- hosts: localhost - tasks: - - debug: var=outputs - vars: - # Ansible has a fit when there are two 'hostvars' evaluations in a resolution chain, - # so we have to repeat logic here unfortunately - outputs: >- - {{- - { "cluster_access_ip": hostvars[groups['openstack'][0]].cluster_gateway_ip } | - combine( - { - "openondemand_url": "https://" ~ (hostvars[groups['openstack'][0]].cluster_gateway_ip | replace('.', '-')) ~ ".sslip.io", - "azimuth_user_password": hostvars[groups['control'][0]].ansible_local.openhpc_secrets.vault_azimuth_user_password - } - if zenith_fqdn_ood is not defined - else {} - ) - }} +# - hosts: openondemand +# tasks: +# - name: Deploy the Zenith client for OOD +# include_role: +# name: zenith_proxy +# vars: +# zenith_proxy_service_name: zenith-ood +# # Use the IP address for the upstream host +# zenith_proxy_upstream_scheme: https +# zenith_proxy_upstream_host: "{{ ansible_default_ipv4.address }}" +# zenith_proxy_upstream_port: 443 +# zenith_proxy_client_token: "{{ zenith_token_ood }}" +# zenith_proxy_client_auth_params: +# tenancy-id: "{{ openstack_project_id }}" +# zenith_proxy_mitm_enabled: yes +# zenith_proxy_mitm_auth_inject: basic +# zenith_proxy_mitm_auth_basic_username: azimuth +# zenith_proxy_mitm_auth_basic_password: "{{ vault_azimuth_user_password }}" +# when: zenith_subdomain_ood is defined + +# - import_playbook: vendor/stackhpc/ansible-slurm-appliance/ansible/adhoc/hpctests.yml + +# # Write the outputs as the final task +# - hosts: localhost +# tasks: +# - debug: var=outputs +# vars: +# # Ansible has a fit when there are two 'hostvars' evaluations in a resolution chain, +# # so we have to repeat logic here unfortunately +# outputs: >- +# {{- +# { "cluster_access_ip": hostvars[groups['openstack'][0]].cluster_gateway_ip } | +# combine( +# { +# "openondemand_url": "https://" ~ (hostvars[groups['openstack'][0]].cluster_gateway_ip | replace('.', '-')) ~ ".sslip.io", +# "azimuth_user_password": hostvars[groups['control'][0]].ansible_local.openhpc_secrets.vault_azimuth_user_password +# } +# if zenith_fqdn_ood is not defined +# else {} +# ) +# }} From d2e71f959d99fbf5fbfd8e0520b5f6065ab370ae Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 7 Nov 2023 15:47:40 +0000 Subject: [PATCH 10/48] remove inventory from caas ansible.cfg now in cluster spec --- ansible.cfg | 1 - 1 file changed, 1 deletion(-) diff --git a/ansible.cfg b/ansible.cfg index d54bf087c..12a570a21 100644 --- a/ansible.cfg +++ b/ansible.cfg @@ -4,7 +4,6 @@ any_errors_fatal = True gathering = smart forks = 30 host_key_checking = False -inventory = environments/common/inventory,environments/.caas/inventory remote_tmp = /tmp collections_path = ansible/collections roles_path = ansible/roles From fdb432c723257116650804e127f426ec9d53292e Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 8 Nov 2023 11:37:39 +0000 Subject: [PATCH 11/48] debugging env --- ansible/noop.yml | 4 +- ansible/slurm-infra.yml | 19 ++++ environments/.caas/hooks/pre.yml | 143 +++++++++++++++++++++++++++++++ 3 files changed, 165 insertions(+), 1 deletion(-) create mode 100644 
environments/.caas/hooks/pre.yml diff --git a/ansible/noop.yml b/ansible/noop.yml index 49317736a..2f0beebb0 100644 --- a/ansible/noop.yml +++ b/ansible/noop.yml @@ -6,4 +6,6 @@ - hosts: localhost gather_facts: false - tasks: [] \ No newline at end of file + tasks: + - debug: + msg: "got a noop" diff --git a/ansible/slurm-infra.yml b/ansible/slurm-infra.yml index a1e541014..1bca11c36 100644 --- a/ansible/slurm-infra.yml +++ b/ansible/slurm-infra.yml @@ -8,6 +8,25 @@ msg: | ansible_inventory_sources: {{ ansible_inventory_sources }} ansible_config_file: {{ ansible_config_file }} + APPLIANCES_ENVIRONMENT_ROOT: {{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }} + appliances_environment_root: {{ appliances_environment_root }} + PWD: {{ lookup('env', 'PWD' )}}" + - command: + cmd: "realpath ." + - stat: + path: environments/.caas/hooks/pre.yml + - stat: + path: ../environments/.caas/hooks/pre.yml + - stat: + path: project/environments/.caas/hooks/pre.yml + + +- name: DEBUG - try pre.yml hook + vars: + appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}" + hook_path: "{{ appliances_environment_root }}/hooks/pre.yml" + import_playbook: "{{ hook_path if hook_path | exists else 'noop.yml' }}" + when: hook_path | exists # # Provision the infrastructure using Terraform # - name: Provision infrastructure diff --git a/environments/.caas/hooks/pre.yml b/environments/.caas/hooks/pre.yml new file mode 100644 index 000000000..1e7c6c78b --- /dev/null +++ b/environments/.caas/hooks/pre.yml @@ -0,0 +1,143 @@ +--- + +- name: DEBUG: RUNNING PRE + meta: end_here + +# Provision the infrastructure using Terraform +- name: Provision infrastructure + hosts: openstack + roles: + - cluster_infra + +# Setup tasks now that all hosts have been added to the correct groups +- hosts: cluster + become: yes + tasks: + # Ensure that the hosts in the cluster can all refer to each other by their hostname + - name: Populate /etc/hosts with cluster hosts + lineinfile: + path: /etc/hosts + regexp: "{{ hostvars[host].inventory_hostname }}" + line: "{{ hostvars[host].ansible_default_ipv4.address }} {{ hostvars[host].inventory_hostname }}" + loop: "{{ ansible_play_hosts }}" + loop_control: + loop_var: host + +# Ensure that the secrets are generated and persisted on the control host +- name: Generate and persist secrets + hosts: control + gather_facts: no + become: yes + roles: + - persist_openhpc_secrets + +# validate.yml asserts presence of a control group which doesn't exist when +# destroying infra, so only validate when we're not destroying +- import_playbook: vendor/stackhpc/ansible-slurm-appliance/ansible/validate.yml + when: cluster_state is not defined or (cluster_state is defined and cluster_state != "absent") + +# The first task in the bootstrap playbook causes the home directory of the rocky user to be moved on the first run +# This can disrupt the SSH connection, particularly because we use the login host as a jump host +# So we move the home directory on the login node and reset the connections first +- hosts: login + gather_facts: false + tasks: + - name: Set up Ansible user + user: "{{ (appliances_local_users_default | selectattr('user.name', 'eq', appliances_local_users_ansible_user_name))[0]['user'] }}" + become_method: "sudo" + # Need to change working directory otherwise we try to switch back to non-existent directory. 
+ become_flags: '-i' + become: true + +- hosts: cluster + gather_facts: no + tasks: + - name: Reset persistent SSH connections + meta: reset_connection + +- import_playbook: vendor/stackhpc/ansible-slurm-appliance/ansible/bootstrap.yml +- import_playbook: vendor/stackhpc/ansible-slurm-appliance/ansible/filesystems.yml +- import_playbook: vendor/stackhpc/ansible-slurm-appliance/ansible/slurm.yml +- import_playbook: vendor/stackhpc/ansible-slurm-appliance/ansible/portal.yml +- import_playbook: vendor/stackhpc/ansible-slurm-appliance/ansible/iam.yml +- import_playbook: vendor/stackhpc/ansible-slurm-appliance/ansible/monitoring.yml + +- name: Persist login hostkey across rebuilds +# Need NFS for this so can't do it before the appliance plays + hosts: login + gather_facts: no + become: yes + roles: + - persist_hostkeys + +# Configure the Zenith clients that are required +# First, ensure that podman is installed on all hosts that will run Zenith clients +- hosts: zenith,!podman + tasks: + - import_role: + name: podman + tasks_from: prereqs.yml + - import_role: + name: podman + tasks_from: config.yml + +- hosts: grafana + tasks: + - name: Deploy the Zenith client for Grafana + include_role: + name: zenith_proxy + vars: + zenith_proxy_service_name: zenith-monitoring + # Use the IP address for the upstream host + zenith_proxy_upstream_host: "{{ ansible_default_ipv4.address }}" + zenith_proxy_upstream_port: "{{ grafana_port }}" + zenith_proxy_client_token: "{{ zenith_token_monitoring }}" + zenith_proxy_client_auth_params: + tenancy-id: "{{ openstack_project_id }}" + zenith_proxy_mitm_enabled: yes + zenith_proxy_mitm_auth_inject: basic + zenith_proxy_mitm_auth_basic_username: "{{ grafana_security.admin_user }}" + zenith_proxy_mitm_auth_basic_password: "{{ grafana_security.admin_password }}" + when: zenith_subdomain_monitoring is defined + +- hosts: openondemand + tasks: + - name: Deploy the Zenith client for OOD + include_role: + name: zenith_proxy + vars: + zenith_proxy_service_name: zenith-ood + # Use the IP address for the upstream host + zenith_proxy_upstream_scheme: https + zenith_proxy_upstream_host: "{{ ansible_default_ipv4.address }}" + zenith_proxy_upstream_port: 443 + zenith_proxy_client_token: "{{ zenith_token_ood }}" + zenith_proxy_client_auth_params: + tenancy-id: "{{ openstack_project_id }}" + zenith_proxy_mitm_enabled: yes + zenith_proxy_mitm_auth_inject: basic + zenith_proxy_mitm_auth_basic_username: azimuth + zenith_proxy_mitm_auth_basic_password: "{{ vault_azimuth_user_password }}" + when: zenith_subdomain_ood is defined + +- import_playbook: vendor/stackhpc/ansible-slurm-appliance/ansible/adhoc/hpctests.yml + +# Write the outputs as the final task +- hosts: localhost + tasks: + - debug: var=outputs + vars: + # Ansible has a fit when there are two 'hostvars' evaluations in a resolution chain, + # so we have to repeat logic here unfortunately + outputs: >- + {{- + { "cluster_access_ip": hostvars[groups['openstack'][0]].cluster_gateway_ip } | + combine( + { + "openondemand_url": "https://" ~ (hostvars[groups['openstack'][0]].cluster_gateway_ip | replace('.', '-')) ~ ".sslip.io", + "azimuth_user_password": hostvars[groups['control'][0]].ansible_local.openhpc_secrets.vault_azimuth_user_password + } + if zenith_fqdn_ood is not defined + else {} + ) + }} From c23cd202a307dde93b2be55faaf885567fe12810 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 8 Nov 2023 13:08:08 +0000 Subject: [PATCH 12/48] debug --- ansible/noop.yml | 2 +- ansible/slurm-infra.yml | 18 ++++++++++-------- 
environments/.caas/hooks/pre.yml | 3 --- 3 files changed, 11 insertions(+), 12 deletions(-) diff --git a/ansible/noop.yml b/ansible/noop.yml index 2f0beebb0..0e5f54169 100644 --- a/ansible/noop.yml +++ b/ansible/noop.yml @@ -8,4 +8,4 @@ gather_facts: false tasks: - debug: - msg: "got a noop" + msg: "got a noop on hook_path: {{ hook_path }}" diff --git a/ansible/slurm-infra.yml b/ansible/slurm-infra.yml index 1bca11c36..66c6ea9ad 100644 --- a/ansible/slurm-infra.yml +++ b/ansible/slurm-infra.yml @@ -11,15 +11,17 @@ APPLIANCES_ENVIRONMENT_ROOT: {{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }} appliances_environment_root: {{ appliances_environment_root }} PWD: {{ lookup('env', 'PWD' )}}" - - command: - cmd: "realpath ." - stat: - path: environments/.caas/hooks/pre.yml - - stat: - path: ../environments/.caas/hooks/pre.yml - - stat: - path: project/environments/.caas/hooks/pre.yml - + path: "{{ hook_path }}" + vars: + hook_path: "{{ appliances_environment_root }}/hooks/pre.yml" + - debug: + msg: | + environments/.caas/hooks/pre.yml: {{ 'environments/.caas/hooks/pre.yml' | exists }} + ../environments/.caas/hooks/pre.yml: {{ '../environments/.caas/hooks/pre.yml' | exists }} + project/environments/.caas/hooks/pre.yml: {{ 'project/environments/.caas/hooks/pre.yml' | exists }} + vars: + hook_path: "{{ appliances_environment_root }}/hooks/pre.yml" - name: DEBUG - try pre.yml hook vars: diff --git a/environments/.caas/hooks/pre.yml b/environments/.caas/hooks/pre.yml index 1e7c6c78b..95a309a8a 100644 --- a/environments/.caas/hooks/pre.yml +++ b/environments/.caas/hooks/pre.yml @@ -1,8 +1,5 @@ --- -- name: DEBUG: RUNNING PRE - meta: end_here - # Provision the infrastructure using Terraform - name: Provision infrastructure hosts: openstack From 435860936ee2c6af85e3b9bb5e4ae01011ddc935 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 8 Nov 2023 16:01:13 +0000 Subject: [PATCH 13/48] use site instead of slurm-infra --- ansible/slurm-infra.yml | 170 ------------------------------ environments/.caas/hooks/post.yml | 79 ++++++++++++++ environments/.caas/hooks/pre.yml | 97 ++--------------- 3 files changed, 85 insertions(+), 261 deletions(-) delete mode 100644 ansible/slurm-infra.yml create mode 100644 environments/.caas/hooks/post.yml diff --git a/ansible/slurm-infra.yml b/ansible/slurm-infra.yml deleted file mode 100644 index 66c6ea9ad..000000000 --- a/ansible/slurm-infra.yml +++ /dev/null @@ -1,170 +0,0 @@ ---- - -- hosts: localhost - become: no - gather_facts: no - tasks: - - debug: - msg: | - ansible_inventory_sources: {{ ansible_inventory_sources }} - ansible_config_file: {{ ansible_config_file }} - APPLIANCES_ENVIRONMENT_ROOT: {{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }} - appliances_environment_root: {{ appliances_environment_root }} - PWD: {{ lookup('env', 'PWD' )}}" - - stat: - path: "{{ hook_path }}" - vars: - hook_path: "{{ appliances_environment_root }}/hooks/pre.yml" - - debug: - msg: | - environments/.caas/hooks/pre.yml: {{ 'environments/.caas/hooks/pre.yml' | exists }} - ../environments/.caas/hooks/pre.yml: {{ '../environments/.caas/hooks/pre.yml' | exists }} - project/environments/.caas/hooks/pre.yml: {{ 'project/environments/.caas/hooks/pre.yml' | exists }} - vars: - hook_path: "{{ appliances_environment_root }}/hooks/pre.yml" - -- name: DEBUG - try pre.yml hook - vars: - appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}" - hook_path: "{{ appliances_environment_root }}/hooks/pre.yml" - import_playbook: "{{ hook_path if hook_path | exists else 
'noop.yml' }}" - when: hook_path | exists - -# # Provision the infrastructure using Terraform -# - name: Provision infrastructure -# hosts: openstack -# roles: -# - cluster_infra - -# # Setup tasks now that all hosts have been added to the correct groups -# - hosts: cluster -# become: yes -# tasks: -# # Ensure that the hosts in the cluster can all refer to each other by their hostname -# - name: Populate /etc/hosts with cluster hosts -# lineinfile: -# path: /etc/hosts -# regexp: "{{ hostvars[host].inventory_hostname }}" -# line: "{{ hostvars[host].ansible_default_ipv4.address }} {{ hostvars[host].inventory_hostname }}" -# loop: "{{ ansible_play_hosts }}" -# loop_control: -# loop_var: host - -# # Ensure that the secrets are generated and persisted on the control host -# - name: Generate and persist secrets -# hosts: control -# gather_facts: no -# become: yes -# roles: -# - persist_openhpc_secrets - -# # validate.yml asserts presence of a control group which doesn't exist when -# # destroying infra, so only validate when we're not destroying -# - import_playbook: vendor/stackhpc/ansible-slurm-appliance/ansible/validate.yml -# when: cluster_state is not defined or (cluster_state is defined and cluster_state != "absent") - -# # The first task in the bootstrap playbook causes the home directory of the rocky user to be moved on the first run -# # This can disrupt the SSH connection, particularly because we use the login host as a jump host -# # So we move the home directory on the login node and reset the connections first -# - hosts: login -# gather_facts: false -# tasks: -# - name: Set up Ansible user -# user: "{{ (appliances_local_users_default | selectattr('user.name', 'eq', appliances_local_users_ansible_user_name))[0]['user'] }}" -# become_method: "sudo" -# # Need to change working directory otherwise we try to switch back to non-existent directory. 
-# become_flags: '-i' -# become: true - -# - hosts: cluster -# gather_facts: no -# tasks: -# - name: Reset persistent SSH connections -# meta: reset_connection - -# - import_playbook: vendor/stackhpc/ansible-slurm-appliance/ansible/bootstrap.yml -# - import_playbook: vendor/stackhpc/ansible-slurm-appliance/ansible/filesystems.yml -# - import_playbook: vendor/stackhpc/ansible-slurm-appliance/ansible/slurm.yml -# - import_playbook: vendor/stackhpc/ansible-slurm-appliance/ansible/portal.yml -# - import_playbook: vendor/stackhpc/ansible-slurm-appliance/ansible/iam.yml -# - import_playbook: vendor/stackhpc/ansible-slurm-appliance/ansible/monitoring.yml - -# - name: Persist login hostkey across rebuilds -# # Need NFS for this so can't do it before the appliance plays -# hosts: login -# gather_facts: no -# become: yes -# roles: -# - persist_hostkeys - -# # Configure the Zenith clients that are required -# # First, ensure that podman is installed on all hosts that will run Zenith clients -# - hosts: zenith,!podman -# tasks: -# - import_role: -# name: podman -# tasks_from: prereqs.yml -# - import_role: -# name: podman -# tasks_from: config.yml - -# - hosts: grafana -# tasks: -# - name: Deploy the Zenith client for Grafana -# include_role: -# name: zenith_proxy -# vars: -# zenith_proxy_service_name: zenith-monitoring -# # Use the IP address for the upstream host -# zenith_proxy_upstream_host: "{{ ansible_default_ipv4.address }}" -# zenith_proxy_upstream_port: "{{ grafana_port }}" -# zenith_proxy_client_token: "{{ zenith_token_monitoring }}" -# zenith_proxy_client_auth_params: -# tenancy-id: "{{ openstack_project_id }}" -# zenith_proxy_mitm_enabled: yes -# zenith_proxy_mitm_auth_inject: basic -# zenith_proxy_mitm_auth_basic_username: "{{ grafana_security.admin_user }}" -# zenith_proxy_mitm_auth_basic_password: "{{ grafana_security.admin_password }}" -# when: zenith_subdomain_monitoring is defined - -# - hosts: openondemand -# tasks: -# - name: Deploy the Zenith client for OOD -# include_role: -# name: zenith_proxy -# vars: -# zenith_proxy_service_name: zenith-ood -# # Use the IP address for the upstream host -# zenith_proxy_upstream_scheme: https -# zenith_proxy_upstream_host: "{{ ansible_default_ipv4.address }}" -# zenith_proxy_upstream_port: 443 -# zenith_proxy_client_token: "{{ zenith_token_ood }}" -# zenith_proxy_client_auth_params: -# tenancy-id: "{{ openstack_project_id }}" -# zenith_proxy_mitm_enabled: yes -# zenith_proxy_mitm_auth_inject: basic -# zenith_proxy_mitm_auth_basic_username: azimuth -# zenith_proxy_mitm_auth_basic_password: "{{ vault_azimuth_user_password }}" -# when: zenith_subdomain_ood is defined - -# - import_playbook: vendor/stackhpc/ansible-slurm-appliance/ansible/adhoc/hpctests.yml - -# # Write the outputs as the final task -# - hosts: localhost -# tasks: -# - debug: var=outputs -# vars: -# # Ansible has a fit when there are two 'hostvars' evaluations in a resolution chain, -# # so we have to repeat logic here unfortunately -# outputs: >- -# {{- -# { "cluster_access_ip": hostvars[groups['openstack'][0]].cluster_gateway_ip } | -# combine( -# { -# "openondemand_url": "https://" ~ (hostvars[groups['openstack'][0]].cluster_gateway_ip | replace('.', '-')) ~ ".sslip.io", -# "azimuth_user_password": hostvars[groups['control'][0]].ansible_local.openhpc_secrets.vault_azimuth_user_password -# } -# if zenith_fqdn_ood is not defined -# else {} -# ) -# }} diff --git a/environments/.caas/hooks/post.yml b/environments/.caas/hooks/post.yml new file mode 100644 index 000000000..e6c3da917 --- 
/dev/null +++ b/environments/.caas/hooks/post.yml @@ -0,0 +1,79 @@ +- name: Persist login hostkey across rebuilds +# Need NFS for this so can't do it before the appliance plays + hosts: login + gather_facts: no + become: yes + roles: + - persist_hostkeys + +# Configure the Zenith clients that are required +# First, ensure that podman is installed on all hosts that will run Zenith clients +- hosts: zenith,!podman + tasks: + - import_role: + name: podman + tasks_from: prereqs.yml + - import_role: + name: podman + tasks_from: config.yml + +- hosts: grafana + tasks: + - name: Deploy the Zenith client for Grafana + include_role: + name: zenith_proxy + vars: + zenith_proxy_service_name: zenith-monitoring + # Use the IP address for the upstream host + zenith_proxy_upstream_host: "{{ ansible_default_ipv4.address }}" + zenith_proxy_upstream_port: "{{ grafana_port }}" + zenith_proxy_client_token: "{{ zenith_token_monitoring }}" + zenith_proxy_client_auth_params: + tenancy-id: "{{ openstack_project_id }}" + zenith_proxy_mitm_enabled: yes + zenith_proxy_mitm_auth_inject: basic + zenith_proxy_mitm_auth_basic_username: "{{ grafana_security.admin_user }}" + zenith_proxy_mitm_auth_basic_password: "{{ grafana_security.admin_password }}" + when: zenith_subdomain_monitoring is defined + +- hosts: openondemand + tasks: + - name: Deploy the Zenith client for OOD + include_role: + name: zenith_proxy + vars: + zenith_proxy_service_name: zenith-ood + # Use the IP address for the upstream host + zenith_proxy_upstream_scheme: https + zenith_proxy_upstream_host: "{{ ansible_default_ipv4.address }}" + zenith_proxy_upstream_port: 443 + zenith_proxy_client_token: "{{ zenith_token_ood }}" + zenith_proxy_client_auth_params: + tenancy-id: "{{ openstack_project_id }}" + zenith_proxy_mitm_enabled: yes + zenith_proxy_mitm_auth_inject: basic + zenith_proxy_mitm_auth_basic_username: azimuth + zenith_proxy_mitm_auth_basic_password: "{{ vault_azimuth_user_password }}" + when: zenith_subdomain_ood is defined + +- import_playbook: vendor/stackhpc/ansible-slurm-appliance/ansible/adhoc/hpctests.yml + +# Write the outputs as the final task +- hosts: localhost + tasks: + - debug: var=outputs + vars: + # Ansible has a fit when there are two 'hostvars' evaluations in a resolution chain, + # so we have to repeat logic here unfortunately + outputs: >- + {{- + { "cluster_access_ip": hostvars[groups['openstack'][0]].cluster_gateway_ip } | + combine( + { + "openondemand_url": "https://" ~ (hostvars[groups['openstack'][0]].cluster_gateway_ip | replace('.', '-')) ~ ".sslip.io", + "azimuth_user_password": hostvars[groups['control'][0]].ansible_local.openhpc_secrets.vault_azimuth_user_password + } + if zenith_fqdn_ood is not defined + else {} + ) + }} \ No newline at end of file diff --git a/environments/.caas/hooks/pre.yml b/environments/.caas/hooks/pre.yml index 95a309a8a..7d2a34b53 100644 --- a/environments/.caas/hooks/pre.yml +++ b/environments/.caas/hooks/pre.yml @@ -28,11 +28,13 @@ roles: - persist_openhpc_secrets -# validate.yml asserts presence of a control group which doesn't exist when -# destroying infra, so only validate when we're not destroying -- import_playbook: vendor/stackhpc/ansible-slurm-appliance/ansible/validate.yml - when: cluster_state is not defined or (cluster_state is defined and cluster_state != "absent") +# TODO: FIXME: +# # validate.yml asserts presence of a control group which doesn't exist when +# # destroying infra, so only validate when we're not destroying +# - import_playbook: ansible/validate.yml +# when: 
cluster_state is not defined or (cluster_state is defined and cluster_state != "absent") +# TODO: FIXME: maybe by doing the user move in cloud-init? # The first task in the bootstrap playbook causes the home directory of the rocky user to be moved on the first run # This can disrupt the SSH connection, particularly because we use the login host as a jump host # So we move the home directory on the login node and reset the connections first @@ -51,90 +53,3 @@ tasks: - name: Reset persistent SSH connections meta: reset_connection - -- import_playbook: vendor/stackhpc/ansible-slurm-appliance/ansible/bootstrap.yml -- import_playbook: vendor/stackhpc/ansible-slurm-appliance/ansible/filesystems.yml -- import_playbook: vendor/stackhpc/ansible-slurm-appliance/ansible/slurm.yml -- import_playbook: vendor/stackhpc/ansible-slurm-appliance/ansible/portal.yml -- import_playbook: vendor/stackhpc/ansible-slurm-appliance/ansible/iam.yml -- import_playbook: vendor/stackhpc/ansible-slurm-appliance/ansible/monitoring.yml - -- name: Persist login hostkey across rebuilds -# Need NFS for this so can't do it before the appliance plays - hosts: login - gather_facts: no - become: yes - roles: - - persist_hostkeys - -# Configure the Zenith clients that are required -# First, ensure that podman is installed on all hosts that will run Zenith clients -- hosts: zenith,!podman - tasks: - - import_role: - name: podman - tasks_from: prereqs.yml - - import_role: - name: podman - tasks_from: config.yml - -- hosts: grafana - tasks: - - name: Deploy the Zenith client for Grafana - include_role: - name: zenith_proxy - vars: - zenith_proxy_service_name: zenith-monitoring - # Use the IP address for the upstream host - zenith_proxy_upstream_host: "{{ ansible_default_ipv4.address }}" - zenith_proxy_upstream_port: "{{ grafana_port }}" - zenith_proxy_client_token: "{{ zenith_token_monitoring }}" - zenith_proxy_client_auth_params: - tenancy-id: "{{ openstack_project_id }}" - zenith_proxy_mitm_enabled: yes - zenith_proxy_mitm_auth_inject: basic - zenith_proxy_mitm_auth_basic_username: "{{ grafana_security.admin_user }}" - zenith_proxy_mitm_auth_basic_password: "{{ grafana_security.admin_password }}" - when: zenith_subdomain_monitoring is defined - -- hosts: openondemand - tasks: - - name: Deploy the Zenith client for OOD - include_role: - name: zenith_proxy - vars: - zenith_proxy_service_name: zenith-ood - # Use the IP address for the upstream host - zenith_proxy_upstream_scheme: https - zenith_proxy_upstream_host: "{{ ansible_default_ipv4.address }}" - zenith_proxy_upstream_port: 443 - zenith_proxy_client_token: "{{ zenith_token_ood }}" - zenith_proxy_client_auth_params: - tenancy-id: "{{ openstack_project_id }}" - zenith_proxy_mitm_enabled: yes - zenith_proxy_mitm_auth_inject: basic - zenith_proxy_mitm_auth_basic_username: azimuth - zenith_proxy_mitm_auth_basic_password: "{{ vault_azimuth_user_password }}" - when: zenith_subdomain_ood is defined - -- import_playbook: vendor/stackhpc/ansible-slurm-appliance/ansible/adhoc/hpctests.yml - -# Write the outputs as the final task -- hosts: localhost - tasks: - - debug: var=outputs - vars: - # Ansible has a fit when there are two 'hostvars' evaluations in a resolution chain, - # so we have to repeat logic here unfortunately - outputs: >- - {{- - { "cluster_access_ip": hostvars[groups['openstack'][0]].cluster_gateway_ip } | - combine( - { - "openondemand_url": "https://" ~ (hostvars[groups['openstack'][0]].cluster_gateway_ip | replace('.', '-')) ~ ".sslip.io", - "azimuth_user_password": 
hostvars[groups['control'][0]].ansible_local.openhpc_secrets.vault_azimuth_user_password - } - if zenith_fqdn_ood is not defined - else {} - ) - }} From 00bd97f2b544b3ffc3c25685ab7ea1cdaa4beb05 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 8 Nov 2023 16:08:07 +0000 Subject: [PATCH 14/48] debug --- environments/.caas/hooks/post.yml | 3 ++- environments/.caas/hooks/pre.yml | 10 ++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/environments/.caas/hooks/post.yml b/environments/.caas/hooks/post.yml index e6c3da917..bbb14fcec 100644 --- a/environments/.caas/hooks/post.yml +++ b/environments/.caas/hooks/post.yml @@ -56,7 +56,8 @@ zenith_proxy_mitm_auth_basic_password: "{{ vault_azimuth_user_password }}" when: zenith_subdomain_ood is defined -- import_playbook: vendor/stackhpc/ansible-slurm-appliance/ansible/adhoc/hpctests.yml +# TODO: FIXME +# - import_playbook: vendor/stackhpc/ansible-slurm-appliance/ansible/adhoc/hpctests.yml # Write the outputs as the final task - hosts: localhost diff --git a/environments/.caas/hooks/pre.yml b/environments/.caas/hooks/pre.yml index 7d2a34b53..4debc88bd 100644 --- a/environments/.caas/hooks/pre.yml +++ b/environments/.caas/hooks/pre.yml @@ -6,6 +6,16 @@ roles: - cluster_infra +- name: DEBUG - show host groups + hosts: all + become: no + gather_facts: no + tasks: + - debug: + var: group_names + - meta: end_here + +# FIXME: replace with etc_hosts # Setup tasks now that all hosts have been added to the correct groups - hosts: cluster become: yes From 0936390b161bc5a14044f945825bc6e4e8c2c6d8 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 8 Nov 2023 16:36:55 +0000 Subject: [PATCH 15/48] add openstack host definition --- environments/.caas/inventory/hosts | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 environments/.caas/inventory/hosts diff --git a/environments/.caas/inventory/hosts b/environments/.caas/inventory/hosts new file mode 100644 index 000000000..88ce71000 --- /dev/null +++ b/environments/.caas/inventory/hosts @@ -0,0 +1,2 @@ +[openstack] +localhost ansible_connection=local ansible_python_interpreter=/usr/bin/python3 From 7c192b8c6eca3f8e85b37ce0e99c69c1d5f83ed7 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 9 Nov 2023 11:25:37 +0000 Subject: [PATCH 16/48] remove ControlPath definition for caas --- ansible.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ansible.cfg b/ansible.cfg index 12a570a21..09c5b9fb9 100644 --- a/ansible.cfg +++ b/ansible.cfg @@ -11,7 +11,7 @@ filter_plugins = ansible/filter_plugins callbacks_enabled = ansible.posix.profile_tasks [ssh_connection] -ssh_args = -o ControlMaster=auto -o ControlPath=~/.ssh/%r@%h-%p -o ControlPersist=240s -o PreferredAuthentications=publickey -o UserKnownHostsFile=/dev/null +ssh_args = -o ControlMaster=auto -o ControlPersist=240s -o PreferredAuthentications=publickey -o UserKnownHostsFile=/dev/null pipelining = True # This is important because we are using one of the hosts in the play as a jump host # This ensures that if the proxy connection is interrupted, rendering the other hosts From a27bcbbc97713339765ca46bd6bbc23c971a1d34 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 9 Nov 2023 11:39:27 +0000 Subject: [PATCH 17/48] don't run validation in caas on destroy --- ansible/site.yml | 1 + environments/.caas/hooks/pre.yml | 14 +++++++++----- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/ansible/site.yml b/ansible/site.yml index 37befa547..e2731c860 100644 --- a/ansible/site.yml +++ b/ansible/site.yml 
@@ -8,6 +8,7 @@ when: hook_path | exists - import_playbook: validate.yml + when: "{{ appliances_validate | default(true) }}" - import_playbook: bootstrap.yml - name: Run post-bootstrap.yml hook diff --git a/environments/.caas/hooks/pre.yml b/environments/.caas/hooks/pre.yml index 4debc88bd..b12a86d70 100644 --- a/environments/.caas/hooks/pre.yml +++ b/environments/.caas/hooks/pre.yml @@ -38,11 +38,15 @@ roles: - persist_openhpc_secrets -# TODO: FIXME: -# # validate.yml asserts presence of a control group which doesn't exist when -# # destroying infra, so only validate when we're not destroying -# - import_playbook: ansible/validate.yml -# when: cluster_state is not defined or (cluster_state is defined and cluster_state != "absent") +# validate.yml asserts presence of a control group which doesn't exist when +# destroying infra, so only validate when we're not destroying +- hosts: openstack + gather_facts: no + become: no + tasks: + - set_fact: + appliances_validate: false + when: "cluster_state | default('') == 'absent'" # TODO: FIXME: maybe by doing the user move in cloud-init? # The first task in the bootstrap playbook causes the home directory of the rocky user to be moved on the first run From 2933daf8e4686020ac0c81c3af957f491a2161d2 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 9 Nov 2023 11:40:28 +0000 Subject: [PATCH 18/48] use etc_hosts role instead of custom task for caas --- ansible/bootstrap.yml | 4 +++ environments/.caas/hooks/pre.yml | 33 ++++++++++++----------- environments/.caas/inventory/extra_groups | 2 ++ 3 files changed, 23 insertions(+), 16 deletions(-) create mode 100644 environments/.caas/inventory/extra_groups diff --git a/ansible/bootstrap.yml b/ansible/bootstrap.yml index 485c637f5..aa89b3f9b 100644 --- a/ansible/bootstrap.yml +++ b/ansible/bootstrap.yml @@ -28,6 +28,10 @@ tasks: - import_role: name: etc_hosts + - name: DEBUG /etc/hosts + command: + cmd: cat /etc/hosts + - meta: end_here - hosts: proxy gather_facts: false diff --git a/environments/.caas/hooks/pre.yml b/environments/.caas/hooks/pre.yml index b12a86d70..e66d8787b 100644 --- a/environments/.caas/hooks/pre.yml +++ b/environments/.caas/hooks/pre.yml @@ -6,29 +6,30 @@ roles: - cluster_infra -- name: DEBUG - show host groups +- name: DEBUG hosts: all become: no gather_facts: no tasks: - debug: var: group_names - - meta: end_here + - debug: + msg: "{{ hostvars[inventory_hostname] | to_nice_yaml }}" -# FIXME: replace with etc_hosts -# Setup tasks now that all hosts have been added to the correct groups -- hosts: cluster - become: yes - tasks: - # Ensure that the hosts in the cluster can all refer to each other by their hostname - - name: Populate /etc/hosts with cluster hosts - lineinfile: - path: /etc/hosts - regexp: "{{ hostvars[host].inventory_hostname }}" - line: "{{ hostvars[host].ansible_default_ipv4.address }} {{ hostvars[host].inventory_hostname }}" - loop: "{{ ansible_play_hosts }}" - loop_control: - loop_var: host +# # FIXME: replace with etc_hosts +# # Setup tasks now that all hosts have been added to the correct groups +# - hosts: cluster +# become: yes +# tasks: +# # Ensure that the hosts in the cluster can all refer to each other by their hostname +# - name: Populate /etc/hosts with cluster hosts +# lineinfile: +# path: /etc/hosts +# regexp: "{{ hostvars[host].inventory_hostname }}" +# line: "{{ hostvars[host].ansible_default_ipv4.address }} {{ hostvars[host].inventory_hostname }}" +# loop: "{{ ansible_play_hosts }}" +# loop_control: +# loop_var: host # Ensure that the secrets are 
generated and persisted on the control host - name: Generate and persist secrets diff --git a/environments/.caas/inventory/extra_groups b/environments/.caas/inventory/extra_groups new file mode 100644 index 000000000..0b8bfc1ba --- /dev/null +++ b/environments/.caas/inventory/extra_groups @@ -0,0 +1,2 @@ +[etc_hosts:children] +cluster From 0665aeb8276ce2032914915406b9f564d7b33549 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 9 Nov 2023 12:07:34 +0000 Subject: [PATCH 19/48] remove add_host building of groups for caas --- ansible/roles/cluster_infra/defaults/main.yml | 53 ------------------- ansible/roles/cluster_infra/tasks/main.yml | 34 ------------ .../cluster_infra/templates/outputs.tf.j2 | 6 +-- environments/.caas/hooks/pre.yml | 9 ++-- environments/.caas/inventory/extra_groups | 5 ++ 5 files changed, 14 insertions(+), 93 deletions(-) diff --git a/ansible/roles/cluster_infra/defaults/main.yml b/ansible/roles/cluster_infra/defaults/main.yml index e22c86255..ef8ea609b 100644 --- a/ansible/roles/cluster_infra/defaults/main.yml +++ b/ansible/roles/cluster_infra/defaults/main.yml @@ -1,56 +1,3 @@ -##### -## WARNING -## -## The groups specified here should replicate the groups in the StackHPC Slurm appliance environments -## -## https://github.com/stackhpc/ansible-slurm-appliance/blob/main/environments/common/inventory/groups -## https://github.com/stackhpc/ansible-slurm-appliance/blob/main/environments/common/layouts/everything -##### -# These groups should represent the everything layout -cluster_groups_required: - login: ["{{ cluster_name }}_login"] - control: ["{{ cluster_name }}_control"] - compute: ["{{ cluster_name }}_compute"] - openhpc: [login, control, compute] - cluster: [openhpc] - selinux: [cluster] - nfs: [cluster] - mysql: [control] - update: [cluster] - basic_users: [cluster] - fail2ban: [login] - firewalld: [fail2ban] - # ignore these for the moment: - #etc_hosts: [] - # cloud_init: [etc_hosts] - systemd: [opensearch, grafana, control, prometheus] - -# These are the additional groups required for monitoring (see everything layout) -cluster_groups_monitoring: - podman: [opensearch, filebeat, mysql] - prometheus: [control] - grafana: [control] - alertmanager: [control] - node_exporter: [cluster] - opensearch: [control] - slurm_stats: [control] - filebeat: [slurm_stats] - -# Additional groups for OOD -cluster_groups_ood: - openondemand: [login] - openondemand_jupyter: [compute] - openondemand_desktop: [compute] - -# Additional groups for running the cluster validation -cluster_groups_validation: - hpctests: [login] - -# Additional groups for Zenith support -cluster_groups_zenith: - # Any hosts in the grafana and openondemand groups should go in the zenith group - zenith: [grafana, openondemand] - cluster_deploy_ssh_keys_extra: [] # List of hw_scsi_models that result in block devices presenting as /dev/sdX diff --git a/ansible/roles/cluster_infra/tasks/main.yml b/ansible/roles/cluster_infra/tasks/main.yml index 796411fb6..cacf7b6a0 100644 --- a/ansible/roles/cluster_infra/tasks/main.yml +++ b/ansible/roles/cluster_infra/tasks/main.yml @@ -102,37 +102,3 @@ - name: Provision infrastructure include_role: name: stackhpc.terraform.infra - -# The hosts provisioned by Terraform are put into a primary group by the role -# These tasks then add those hosts to additional groups depending on the selected options -- name: Add cluster hosts to required groups - add_host: - name: "{{ item }}" - groups: "{{ hostvars[item].group_names | 
stackhpc.terraform.terraform_infra_expand_groups(cluster_groups_required) }}" - loop: "{{ groups.get('cluster', []) }}" - -- name: Add cluster hosts to OOD groups - add_host: - name: "{{ item }}" - groups: "{{ hostvars[item].group_names | stackhpc.terraform.terraform_infra_expand_groups(cluster_groups_ood) }}" - loop: "{{ groups.get('cluster', []) }}" - -- name: Add cluster hosts to monitoring groups - add_host: - name: "{{ item }}" - groups: "{{ hostvars[item].group_names | stackhpc.terraform.terraform_infra_expand_groups(cluster_groups_monitoring) }}" - loop: "{{ groups.get('cluster', []) }}" - -- name: Add cluster hosts to validation groups - add_host: - name: "{{ item }}" - groups: "{{ hostvars[item].group_names | stackhpc.terraform.terraform_infra_expand_groups(cluster_groups_validation) }}" - loop: "{{ groups.get('cluster', []) }}" - when: cluster_run_validation | default(false) | bool - -- name: Add cluster hosts to Zenith groups - add_host: - name: "{{ item }}" - groups: "{{ hostvars[item].group_names | stackhpc.terraform.terraform_infra_expand_groups(cluster_groups_zenith) }}" - loop: "{{ groups.get('cluster', []) }}" - when: zenith_subdomain_monitoring is defined diff --git a/ansible/roles/cluster_infra/templates/outputs.tf.j2 b/ansible/roles/cluster_infra/templates/outputs.tf.j2 index 75f46717c..70b57d119 100644 --- a/ansible/roles/cluster_infra/templates/outputs.tf.j2 +++ b/ansible/roles/cluster_infra/templates/outputs.tf.j2 @@ -18,7 +18,7 @@ output "cluster_nodes" { { name = openstack_compute_instance_v2.login.name ip = openstack_compute_instance_v2.login.network[0].fixed_ip_v4 - groups = ["{{ cluster_name }}_login"], + groups = ["login", "{{ cluster_name }}_login"], facts = { openstack_project_id = data.openstack_identity_auth_scope_v3.scope.project_id } @@ -26,7 +26,7 @@ output "cluster_nodes" { { name = openstack_compute_instance_v2.control.name ip = openstack_compute_instance_v2.control.network[0].fixed_ip_v4 - groups = ["{{ cluster_name }}_control"], + groups = ["control", "{{ cluster_name }}_control"], facts = { openstack_project_id = data.openstack_identity_auth_scope_v3.scope.project_id } @@ -37,7 +37,7 @@ output "cluster_nodes" { for compute in openstack_compute_instance_v2.{{ partition.name }}: { name = compute.name ip = compute.network[0].fixed_ip_v4 - groups = ["{{ cluster_name }}_compute", "{{ cluster_name }}_{{ partition.name }}"], + groups = ["compute", "{{ cluster_name }}_compute", "{{ cluster_name }}_{{ partition.name }}"], facts = { openstack_project_id = data.openstack_identity_auth_scope_v3.scope.project_id } diff --git a/environments/.caas/hooks/pre.yml b/environments/.caas/hooks/pre.yml index e66d8787b..7ad017e65 100644 --- a/environments/.caas/hooks/pre.yml +++ b/environments/.caas/hooks/pre.yml @@ -11,10 +11,13 @@ become: no gather_facts: no tasks: - - debug: + - name: debug groups + debug: var: group_names - - debug: - msg: "{{ hostvars[inventory_hostname] | to_nice_yaml }}" + # - name: debug hostvars + # debug: + # msg: "{{ hostvars[inventory_hostname] | to_nice_yaml }}" + # - meta: end_here # # FIXME: replace with etc_hosts # # Setup tasks now that all hosts have been added to the correct groups diff --git a/environments/.caas/inventory/extra_groups b/environments/.caas/inventory/extra_groups index 0b8bfc1ba..cfa56c823 100644 --- a/environments/.caas/inventory/extra_groups +++ b/environments/.caas/inventory/extra_groups @@ -1,2 +1,7 @@ [etc_hosts:children] cluster + +[zenith:children] +# TODO: originally only when: zenith_subdomain_monitoring is 
defined +grafana +openondemand From 6fe51e615d17051ccdfb33c8b1457870148eb571 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 9 Nov 2023 13:45:30 +0000 Subject: [PATCH 20/48] finish /etc/host caas fixes to use internal_address --- ansible/bootstrap.yml | 4 ---- environments/.caas/hooks/post.yml | 4 ++-- environments/.caas/hooks/pre.yml | 15 --------------- .../.caas/inventory/group_vars/all/nfs.yml | 3 --- 4 files changed, 2 insertions(+), 24 deletions(-) diff --git a/ansible/bootstrap.yml b/ansible/bootstrap.yml index aa89b3f9b..485c637f5 100644 --- a/ansible/bootstrap.yml +++ b/ansible/bootstrap.yml @@ -28,10 +28,6 @@ tasks: - import_role: name: etc_hosts - - name: DEBUG /etc/hosts - command: - cmd: cat /etc/hosts - - meta: end_here - hosts: proxy gather_facts: false diff --git a/environments/.caas/hooks/post.yml b/environments/.caas/hooks/post.yml index bbb14fcec..8926701e6 100644 --- a/environments/.caas/hooks/post.yml +++ b/environments/.caas/hooks/post.yml @@ -25,7 +25,7 @@ vars: zenith_proxy_service_name: zenith-monitoring # Use the IP address for the upstream host - zenith_proxy_upstream_host: "{{ ansible_default_ipv4.address }}" + zenith_proxy_upstream_host: "{{ internal_address }}" zenith_proxy_upstream_port: "{{ grafana_port }}" zenith_proxy_client_token: "{{ zenith_token_monitoring }}" zenith_proxy_client_auth_params: @@ -45,7 +45,7 @@ zenith_proxy_service_name: zenith-ood # Use the IP address for the upstream host zenith_proxy_upstream_scheme: https - zenith_proxy_upstream_host: "{{ ansible_default_ipv4.address }}" + zenith_proxy_upstream_host: "{{ internal_address }}" zenith_proxy_upstream_port: 443 zenith_proxy_client_token: "{{ zenith_token_ood }}" zenith_proxy_client_auth_params: diff --git a/environments/.caas/hooks/pre.yml b/environments/.caas/hooks/pre.yml index 7ad017e65..c99267277 100644 --- a/environments/.caas/hooks/pre.yml +++ b/environments/.caas/hooks/pre.yml @@ -19,21 +19,6 @@ # msg: "{{ hostvars[inventory_hostname] | to_nice_yaml }}" # - meta: end_here -# # FIXME: replace with etc_hosts -# # Setup tasks now that all hosts have been added to the correct groups -# - hosts: cluster -# become: yes -# tasks: -# # Ensure that the hosts in the cluster can all refer to each other by their hostname -# - name: Populate /etc/hosts with cluster hosts -# lineinfile: -# path: /etc/hosts -# regexp: "{{ hostvars[host].inventory_hostname }}" -# line: "{{ hostvars[host].ansible_default_ipv4.address }} {{ hostvars[host].inventory_hostname }}" -# loop: "{{ ansible_play_hosts }}" -# loop_control: -# loop_var: host - # Ensure that the secrets are generated and persisted on the control host - name: Generate and persist secrets hosts: control diff --git a/environments/.caas/inventory/group_vars/all/nfs.yml b/environments/.caas/inventory/group_vars/all/nfs.yml index b21cae962..fc5dcf7d7 100644 --- a/environments/.caas/inventory/group_vars/all/nfs.yml +++ b/environments/.caas/inventory/group_vars/all/nfs.yml @@ -1,6 +1,3 @@ -# Use the IP address instead -nfs_server: "{{ hostvars[groups['control'] | first ].ansible_default_ipv4.address }}" - nfs_configurations: - comment: Export /exports/home from Slurm control node as /home nfs_enable: From 8eb5827775ad8f489353fde6c6976f64a88ac093 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 9 Nov 2023 13:46:01 +0000 Subject: [PATCH 21/48] fix hpctests for caas --- environments/.caas/hooks/post.yml | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/environments/.caas/hooks/post.yml 
b/environments/.caas/hooks/post.yml index 8926701e6..8b6b6aafc 100644 --- a/environments/.caas/hooks/post.yml +++ b/environments/.caas/hooks/post.yml @@ -56,8 +56,14 @@ zenith_proxy_mitm_auth_basic_password: "{{ vault_azimuth_user_password }}" when: zenith_subdomain_ood is defined -# TODO: FIXME -# - import_playbook: vendor/stackhpc/ansible-slurm-appliance/ansible/adhoc/hpctests.yml +# Run hpctests if set in UI +- hosts: hpctests[0] + become: false + gather_facts: false + tasks: + - import_role: + name: hpctests + when: cluster_run_validation | default(false) | bool # Write the outputs as the final task - hosts: localhost From 8ca9dc783a20a4d6ffe4f81263888ad893d373ac Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 9 Nov 2023 14:01:33 +0000 Subject: [PATCH 22/48] fix NFS for caas --- environments/.caas/inventory/group_vars/all/nfs.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/environments/.caas/inventory/group_vars/all/nfs.yml b/environments/.caas/inventory/group_vars/all/nfs.yml index fc5dcf7d7..2ea3abe57 100644 --- a/environments/.caas/inventory/group_vars/all/nfs.yml +++ b/environments/.caas/inventory/group_vars/all/nfs.yml @@ -1,3 +1,5 @@ +nfs_server: "{{ nfs_server_default }}" + nfs_configurations: - comment: Export /exports/home from Slurm control node as /home nfs_enable: From 45d0c1bcf0de013f4c5a2987c7fd57cc39e73132 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 9 Nov 2023 14:17:45 +0000 Subject: [PATCH 23/48] fix secrets for caas --- ansible/bootstrap.yml | 1 + ansible/site.yml | 1 + ansible/validate.yml | 4 ++++ 3 files changed, 6 insertions(+) diff --git a/ansible/bootstrap.yml b/ansible/bootstrap.yml index 485c637f5..b6d60b3a9 100644 --- a/ansible/bootstrap.yml +++ b/ansible/bootstrap.yml @@ -99,6 +99,7 @@ name: fail2ban - name: Setup podman + gather_facts: false hosts: podman tags: podman tasks: diff --git a/ansible/site.yml b/ansible/site.yml index e2731c860..a0efc4045 100644 --- a/ansible/site.yml +++ b/ansible/site.yml @@ -9,6 +9,7 @@ - import_playbook: validate.yml when: "{{ appliances_validate | default(true) }}" + - import_playbook: bootstrap.yml - name: Run post-bootstrap.yml hook diff --git a/ansible/validate.yml b/ansible/validate.yml index d294e98e5..6a482e03a 100644 --- a/ansible/validate.yml +++ b/ansible/validate.yml @@ -4,6 +4,7 @@ - name: Ensure control node is in inventory hosts: all + gather_facts: false tasks: - assert: that: groups['control'] | length @@ -11,6 +12,7 @@ - name: Validate openhpc configuration hosts: openhpc + gather_facts: false tags: openhpc tasks: - assert: @@ -22,6 +24,7 @@ - name: Validate podman configuration hosts: podman + gather_facts: false tags: podman tasks: - import_role: @@ -31,6 +34,7 @@ - name: Validate filebeat configuration hosts: filebeat + gather_facts: false tags: filebeat tasks: - import_role: From 5418d4f7e381c33b0beef0e20742198b2eea02ca Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 9 Nov 2023 15:53:13 +0000 Subject: [PATCH 24/48] add basic_users to caas for azimuth user --- environments/.caas/inventory/extra_groups | 4 +++- environments/.caas/inventory/group_vars/all/basic_users.yml | 6 ++++++ 2 files changed, 9 insertions(+), 1 deletion(-) create mode 100644 environments/.caas/inventory/group_vars/all/basic_users.yml diff --git a/environments/.caas/inventory/extra_groups b/environments/.caas/inventory/extra_groups index cfa56c823..a6f06b7a7 100644 --- a/environments/.caas/inventory/extra_groups +++ b/environments/.caas/inventory/extra_groups @@ -1,7 +1,9 @@ +[basic_users:children] 
+cluster + [etc_hosts:children] cluster [zenith:children] -# TODO: originally only when: zenith_subdomain_monitoring is defined grafana openondemand diff --git a/environments/.caas/inventory/group_vars/all/basic_users.yml b/environments/.caas/inventory/group_vars/all/basic_users.yml new file mode 100644 index 000000000..6105df821 --- /dev/null +++ b/environments/.caas/inventory/group_vars/all/basic_users.yml @@ -0,0 +1,6 @@ +basic_users_users: + - name: azimuth + # Hash the password with a salt that is different for each host + password: "{{ vault_azimuth_user_password | password_hash('sha512', 65534 | random(seed=inventory_hostname) | string) }}" + uid: 1005 + public_key: "{{ cluster_user_ssh_public_key }}" From 018b17abbf71eef2a439f80a8481f21936f3a54c Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 9 Nov 2023 16:57:44 +0000 Subject: [PATCH 25/48] remove debugging --- environments/.caas/hooks/pre.yml | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/environments/.caas/hooks/pre.yml b/environments/.caas/hooks/pre.yml index c99267277..05b0255c8 100644 --- a/environments/.caas/hooks/pre.yml +++ b/environments/.caas/hooks/pre.yml @@ -6,19 +6,6 @@ roles: - cluster_infra -- name: DEBUG - hosts: all - become: no - gather_facts: no - tasks: - - name: debug groups - debug: - var: group_names - # - name: debug hostvars - # debug: - # msg: "{{ hostvars[inventory_hostname] | to_nice_yaml }}" - # - meta: end_here - # Ensure that the secrets are generated and persisted on the control host - name: Generate and persist secrets hosts: control From f88d24e73767c8f7c1ebbce7bf0bb69aea13861b Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 9 Nov 2023 17:01:33 +0000 Subject: [PATCH 26/48] remove debugging --- ansible/noop.yml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/ansible/noop.yml b/ansible/noop.yml index 0e5f54169..adad24813 100644 --- a/ansible/noop.yml +++ b/ansible/noop.yml @@ -6,6 +6,4 @@ - hosts: localhost gather_facts: false - tasks: - - debug: - msg: "got a noop on hook_path: {{ hook_path }}" + tasks: [] From 046335b46b465a7dc562af4df18ebc30a0e17994 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 9 Nov 2023 17:01:52 +0000 Subject: [PATCH 27/48] remove slurm-infra-fast-volume-type.yml pending discussion --- ansible/slurm-infra-fast-volume-type.yml | 140 ----------------------- 1 file changed, 140 deletions(-) delete mode 100644 ansible/slurm-infra-fast-volume-type.yml diff --git a/ansible/slurm-infra-fast-volume-type.yml b/ansible/slurm-infra-fast-volume-type.yml deleted file mode 100644 index 95a309a8a..000000000 --- a/ansible/slurm-infra-fast-volume-type.yml +++ /dev/null @@ -1,140 +0,0 @@ ---- - -# Provision the infrastructure using Terraform -- name: Provision infrastructure - hosts: openstack - roles: - - cluster_infra - -# Setup tasks now that all hosts have been added to the correct groups -- hosts: cluster - become: yes - tasks: - # Ensure that the hosts in the cluster can all refer to each other by their hostname - - name: Populate /etc/hosts with cluster hosts - lineinfile: - path: /etc/hosts - regexp: "{{ hostvars[host].inventory_hostname }}" - line: "{{ hostvars[host].ansible_default_ipv4.address }} {{ hostvars[host].inventory_hostname }}" - loop: "{{ ansible_play_hosts }}" - loop_control: - loop_var: host - -# Ensure that the secrets are generated and persisted on the control host -- name: Generate and persist secrets - hosts: control - gather_facts: no - become: yes - roles: - - persist_openhpc_secrets - -# validate.yml 
asserts presence of a control group which doesn't exist when -# destroying infra, so only validate when we're not destroying -- import_playbook: vendor/stackhpc/ansible-slurm-appliance/ansible/validate.yml - when: cluster_state is not defined or (cluster_state is defined and cluster_state != "absent") - -# The first task in the bootstrap playbook causes the home directory of the rocky user to be moved on the first run -# This can disrupt the SSH connection, particularly because we use the login host as a jump host -# So we move the home directory on the login node and reset the connections first -- hosts: login - gather_facts: false - tasks: - - name: Set up Ansible user - user: "{{ (appliances_local_users_default | selectattr('user.name', 'eq', appliances_local_users_ansible_user_name))[0]['user'] }}" - become_method: "sudo" - # Need to change working directory otherwise we try to switch back to non-existent directory. - become_flags: '-i' - become: true - -- hosts: cluster - gather_facts: no - tasks: - - name: Reset persistent SSH connections - meta: reset_connection - -- import_playbook: vendor/stackhpc/ansible-slurm-appliance/ansible/bootstrap.yml -- import_playbook: vendor/stackhpc/ansible-slurm-appliance/ansible/filesystems.yml -- import_playbook: vendor/stackhpc/ansible-slurm-appliance/ansible/slurm.yml -- import_playbook: vendor/stackhpc/ansible-slurm-appliance/ansible/portal.yml -- import_playbook: vendor/stackhpc/ansible-slurm-appliance/ansible/iam.yml -- import_playbook: vendor/stackhpc/ansible-slurm-appliance/ansible/monitoring.yml - -- name: Persist login hostkey across rebuilds -# Need NFS for this so can't do it before the appliance plays - hosts: login - gather_facts: no - become: yes - roles: - - persist_hostkeys - -# Configure the Zenith clients that are required -# First, ensure that podman is installed on all hosts that will run Zenith clients -- hosts: zenith,!podman - tasks: - - import_role: - name: podman - tasks_from: prereqs.yml - - import_role: - name: podman - tasks_from: config.yml - -- hosts: grafana - tasks: - - name: Deploy the Zenith client for Grafana - include_role: - name: zenith_proxy - vars: - zenith_proxy_service_name: zenith-monitoring - # Use the IP address for the upstream host - zenith_proxy_upstream_host: "{{ ansible_default_ipv4.address }}" - zenith_proxy_upstream_port: "{{ grafana_port }}" - zenith_proxy_client_token: "{{ zenith_token_monitoring }}" - zenith_proxy_client_auth_params: - tenancy-id: "{{ openstack_project_id }}" - zenith_proxy_mitm_enabled: yes - zenith_proxy_mitm_auth_inject: basic - zenith_proxy_mitm_auth_basic_username: "{{ grafana_security.admin_user }}" - zenith_proxy_mitm_auth_basic_password: "{{ grafana_security.admin_password }}" - when: zenith_subdomain_monitoring is defined - -- hosts: openondemand - tasks: - - name: Deploy the Zenith client for OOD - include_role: - name: zenith_proxy - vars: - zenith_proxy_service_name: zenith-ood - # Use the IP address for the upstream host - zenith_proxy_upstream_scheme: https - zenith_proxy_upstream_host: "{{ ansible_default_ipv4.address }}" - zenith_proxy_upstream_port: 443 - zenith_proxy_client_token: "{{ zenith_token_ood }}" - zenith_proxy_client_auth_params: - tenancy-id: "{{ openstack_project_id }}" - zenith_proxy_mitm_enabled: yes - zenith_proxy_mitm_auth_inject: basic - zenith_proxy_mitm_auth_basic_username: azimuth - zenith_proxy_mitm_auth_basic_password: "{{ vault_azimuth_user_password }}" - when: zenith_subdomain_ood is defined - -- import_playbook: 
vendor/stackhpc/ansible-slurm-appliance/ansible/adhoc/hpctests.yml - -# Write the outputs as the final task -- hosts: localhost - tasks: - - debug: var=outputs - vars: - # Ansible has a fit when there are two 'hostvars' evaluations in a resolution chain, - # so we have to repeat logic here unfortunately - outputs: >- - {{- - { "cluster_access_ip": hostvars[groups['openstack'][0]].cluster_gateway_ip } | - combine( - { - "openondemand_url": "https://" ~ (hostvars[groups['openstack'][0]].cluster_gateway_ip | replace('.', '-')) ~ ".sslip.io", - "azimuth_user_password": hostvars[groups['control'][0]].ansible_local.openhpc_secrets.vault_azimuth_user_password - } - if zenith_fqdn_ood is not defined - else {} - ) - }} From ea3b86b09292017d9a88302b5d3ba35a5fbabebd Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 9 Nov 2023 17:07:07 +0000 Subject: [PATCH 28/48] move azimuth ui-meta inside caas environment --- .../.caas/ui-meta}/slurm-infra-fast-volume-type.yml | 0 {ui-meta => environments/.caas/ui-meta}/slurm-infra.yml | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename {ui-meta => environments/.caas/ui-meta}/slurm-infra-fast-volume-type.yml (100%) rename {ui-meta => environments/.caas/ui-meta}/slurm-infra.yml (100%) diff --git a/ui-meta/slurm-infra-fast-volume-type.yml b/environments/.caas/ui-meta/slurm-infra-fast-volume-type.yml similarity index 100% rename from ui-meta/slurm-infra-fast-volume-type.yml rename to environments/.caas/ui-meta/slurm-infra-fast-volume-type.yml diff --git a/ui-meta/slurm-infra.yml b/environments/.caas/ui-meta/slurm-infra.yml similarity index 100% rename from ui-meta/slurm-infra.yml rename to environments/.caas/ui-meta/slurm-infra.yml From ff71ee09250154b79c8d3247568088bd4a43b8c1 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 9 Nov 2023 17:12:57 +0000 Subject: [PATCH 29/48] remove unused files after cloud-init role removal --- ansible/.gitignore | 2 -- ansible/adhoc/template-cloud-init.yml | 9 --------- environments/.caas/cloud_init/.gitkeep | 0 environments/.stackhpc/cloud_init/.gitkeep | 0 .../{{cookiecutter.environment}}/cloud_init/.gitkeep | 0 5 files changed, 11 deletions(-) delete mode 100644 ansible/adhoc/template-cloud-init.yml delete mode 100644 environments/.caas/cloud_init/.gitkeep delete mode 100644 environments/.stackhpc/cloud_init/.gitkeep delete mode 100644 environments/skeleton/{{cookiecutter.environment}}/cloud_init/.gitkeep diff --git a/ansible/.gitignore b/ansible/.gitignore index 4bf56fe3b..47a79a28a 100644 --- a/ansible/.gitignore +++ b/ansible/.gitignore @@ -28,8 +28,6 @@ roles/* !roles/firewalld/** !roles/etc_hosts/ !roles/etc_hosts/** -!roles/cloud_init/ -!roles/cloud_init/** !roles/mysql/ !roles/mysql/** !roles/systemd/ diff --git a/ansible/adhoc/template-cloud-init.yml b/ansible/adhoc/template-cloud-init.yml deleted file mode 100644 index 92bb14a5d..000000000 --- a/ansible/adhoc/template-cloud-init.yml +++ /dev/null @@ -1,9 +0,0 @@ -- hosts: cloud_init - become: no - gather_facts: no - tasks: - - name: Template out cloud-init userdata - import_role: - name: cloud_init - tasks_from: template.yml - delegate_to: localhost diff --git a/environments/.caas/cloud_init/.gitkeep b/environments/.caas/cloud_init/.gitkeep deleted file mode 100644 index e69de29bb..000000000 diff --git a/environments/.stackhpc/cloud_init/.gitkeep b/environments/.stackhpc/cloud_init/.gitkeep deleted file mode 100644 index e69de29bb..000000000 diff --git a/environments/skeleton/{{cookiecutter.environment}}/cloud_init/.gitkeep 
b/environments/skeleton/{{cookiecutter.environment}}/cloud_init/.gitkeep deleted file mode 100644 index e69de29bb..000000000 From 2b264c74bed3d19bbe17edd40dd2ecf417f0086b Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 9 Nov 2023 18:09:55 +0000 Subject: [PATCH 30/48] converge common and caas ondemand config --- .../inventory/group_vars/all/openondemand.yml | 66 ------------------- .../inventory/group_vars/all/openondemand.yml | 36 +++++----- 2 files changed, 20 insertions(+), 82 deletions(-) diff --git a/environments/.caas/inventory/group_vars/all/openondemand.yml b/environments/.caas/inventory/group_vars/all/openondemand.yml index 9ee960417..60461bd61 100644 --- a/environments/.caas/inventory/group_vars/all/openondemand.yml +++ b/environments/.caas/inventory/group_vars/all/openondemand.yml @@ -7,69 +7,3 @@ httpd_listen_addr_port: - 80 - 443 -# Allow proxying to compute nodes for apps and control for monitoring only when the grafana group is available -openondemand_host_regex: "{{ (groups['compute'] + groups['grafana']) | to_ood_regex }}" - -# Add grafana to dashboard links to OOD only if grafana group is available -openondemand_dashboard_links_grafana: - - name: Grafana - app_name: grafana - category: Monitoring - description: Dashboards - url: "{{ grafana_url_openondemand_proxy }}" -openondemand_dashboard_links: "{{ openondemand_dashboard_links_grafana if 'grafana' in groups else [] }}" - -# Add grafana panel to jobs page only if grafana group is available -openondemand_clusters: - slurm: - v2: - metadata: - title: "{{ openhpc_cluster_name }}" # interpolation here works as openondemand is lexically after openhpc - login: - host: "{{ hostvars[groups['login'].0].api_address }}" - default: true - job: - adapter: slurm - cluster: "{{ openhpc_cluster_name }}" - batch_connect: - basic: - script_wrapper: |- - module purge - export PATH=/opt/jupyter/bin/:$PATH - %s - set_host: host=$(hostname -s) - vnc: - script_wrapper: |- - module purge - export PATH=/opt/TurboVNC/bin:$PATH - # Workaround to avoid "Unable to contact settings server" when - # lauching xfce4-session - xfce4-session() { /bin/dbus-launch /bin/xfce4-session $@ ; } - export -f xfce4-session - %s - set_host: host=$(hostname -s) - custom: "{{ openondemand_clusters_grafana if 'grafana' in groups else {} }}" - -grafana_address: "{{ hostvars[groups['grafana'][0]]['api_address'] if 'grafana' in groups else '' }}" -grafana_url_openondemand_proxy: "https://{{ openondemand_servername }}/node/{{ groups['grafana'][0] if 'grafana' in groups else '' }}/{{ grafana_port }}" - -openondemand_clusters_grafana: - # embed grafana panels in Jobs app: https://osc.github.io/ood-documentation/latest/customization.html#grafana-support - grafana: - host: "{{ grafana_url_openondemand_proxy if 'openondemand' in groups else grafana_url_direct }}" - orgId: 1 - dashboard: - name: "node-exporter-slurm" - uid: "node-exporter-slurm" - panels: - cpu: 77 - memory: 78 - labels: - cluster: "cluster" - host: "host" - jobid: "jobid" - -_opeonondemand_unset_auth: ' RequestHeader unset Authorization' - -# Fix grafana proxying for basic auth if anonymous grafana access enabled: -openondemand_node_proxy_directives: "{{ _opeonondemand_unset_auth if (openondemand_auth == 'basic_pam' and 'openondemand_host_regex' and 'grafana' in groups and hostvars[groups['grafana'][0]]._grafana_auth_is_anonymous) else '' }}" diff --git a/environments/common/inventory/group_vars/all/openondemand.yml b/environments/common/inventory/group_vars/all/openondemand.yml index 
c63191095..660b7631c 100644 --- a/environments/common/inventory/group_vars/all/openondemand.yml +++ b/environments/common/inventory/group_vars/all/openondemand.yml @@ -15,12 +15,14 @@ openondemand_host_regex: "{{ (groups['compute'] + groups['grafana']) | to_ood_re ondemand_package: ondemand-3.0.1 -openondemand_dashboard_links: # TODO: should really only be deployed if grafana is deployed and proxying configured +# Add grafana to dashboard links to OOD only if grafana group is available +openondemand_dashboard_links_grafana: - name: Grafana app_name: grafana category: Monitoring description: Dashboards url: "{{ grafana_url_openondemand_proxy }}" +openondemand_dashboard_links: "{{ openondemand_dashboard_links_grafana if groups['grafana'] | length > 0 }}" openondemand_clusters: slurm: @@ -52,21 +54,23 @@ openondemand_clusters: export -f xfce4-session %s set_host: host=$(hostname -s) - custom: - # embed grafana panels in Jobs app: https://osc.github.io/ood-documentation/latest/customization.html#grafana-support - grafana: - host: "{{ grafana_url }}" - orgId: 1 - dashboard: - name: "node-exporter-slurm" - uid: "node-exporter-slurm" - panels: - cpu: 77 - memory: 78 - labels: - cluster: "cluster" - host: "host" - jobid: "jobid" + custom: "{{ openondemand_clusters_grafana if groups['grafana'] | length > 0 else {} }}" + +openondemand_clusters_grafana: + # embed grafana panels in Jobs app: https://osc.github.io/ood-documentation/latest/customization.html#grafana-support + grafana: + host: "{{ grafana_url }}" + orgId: 1 + dashboard: + name: "node-exporter-slurm" + uid: "node-exporter-slurm" + panels: + cpu: 77 + memory: 78 + labels: + cluster: "cluster" + host: "host" + jobid: "jobid" ood_install_apps_defaults: jupyter: From 95985f0d7de7f1ff41737f959b510c4bd656408f Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 9 Nov 2023 18:14:11 +0000 Subject: [PATCH 31/48] converge caas prom config --- .../.caas/inventory/group_vars/all/prometheus.yml | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/environments/.caas/inventory/group_vars/all/prometheus.yml b/environments/.caas/inventory/group_vars/all/prometheus.yml index 3ea282893..eb28fda63 100644 --- a/environments/.caas/inventory/group_vars/all/prometheus.yml +++ b/environments/.caas/inventory/group_vars/all/prometheus.yml @@ -1,11 +1,4 @@ --- -# Override openondemand_address because its needed in openondemand_scrape_configs -# which is used in prometheus_scrape_configs -openondemand_address: "{{ hostvars[groups['openondemand'].0].api_address if 'openondemand' in groups else '' }}" - -# Override group_var set in ansible-slurm-appliance all group - unless -# OOD is being deployed then there won't be an OOD group -prometheus_scrape_configs: "{{ prometheus_scrape_configs_default + (openondemand_scrape_configs if ( 'openondemand' in groups ) else [] ) }}" # Set Prometheus storage retention size -prometheus_storage_retention_size: "{{ metrics_db_maximum_size }}GB" \ No newline at end of file +prometheus_storage_retention_size: "{{ metrics_db_maximum_size }}GB" From c7b56bd078e23223bbb21daa811a42b1e775c649 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 9 Nov 2023 18:15:32 +0000 Subject: [PATCH 32/48] fixup comment --- environments/.caas/inventory/group_vars/all/hpctests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environments/.caas/inventory/group_vars/all/hpctests.yml b/environments/.caas/inventory/group_vars/all/hpctests.yml index 6e3a0cbae..a31437be3 100644 --- 
a/environments/.caas/inventory/group_vars/all/hpctests.yml +++ b/environments/.caas/inventory/group_vars/all/hpctests.yml @@ -1,6 +1,6 @@ # Skip plotting pingpong as matplotlib not in runner environment hpctests_pingpong_plot: false -# In CaaS, the Ansible controller is an ephemeral AWX pod, so all that matters is that +# In Azimuth, the Ansible controller is an ephemeral pod, so all that matters is that # this is a location that is writable by the container user hpctests_outdir: "{{ playbook_dir }}/.tmp/hpctests" From 2ba7815dd59c1b56292ddc906f96fcdfb725af47 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 9 Nov 2023 20:11:12 +0000 Subject: [PATCH 33/48] remove unused caas activate script --- environments/.caas/activate | 23 ----------------------- 1 file changed, 23 deletions(-) delete mode 100644 environments/.caas/activate diff --git a/environments/.caas/activate b/environments/.caas/activate deleted file mode 100644 index e74031095..000000000 --- a/environments/.caas/activate +++ /dev/null @@ -1,23 +0,0 @@ -export APPLIANCES_ENVIRONMENT_ROOT=$(dirname $(realpath ${BASH_SOURCE[0]:-${(%):-%x}})) -echo "Setting APPLIANCES_ENVIRONMENT_ROOT to $APPLIANCES_ENVIRONMENT_ROOT" - -APPLIANCES_ENVIRONMENT_NAME=$(basename $APPLIANCES_ENVIRONMENT_ROOT) -export PS1="${APPLIANCES_ENVIRONMENT_NAME}/ ${PS1}" - -export APPLIANCES_REPO_ROOT=$(realpath "$APPLIANCES_ENVIRONMENT_ROOT/../..") -echo "Setting APPLIANCES_REPO_ROOT to $APPLIANCES_REPO_ROOT" - -export TF_VAR_environment_root=$(realpath "$APPLIANCES_ENVIRONMENT_ROOT") -echo "Setting TF_VAR_environment_root to $TF_VAR_environment_root" - -export PKR_VAR_environment_root=$(realpath "$APPLIANCES_ENVIRONMENT_ROOT") -echo "Setting PKR_VAR_environment_root to $PKR_VAR_environment_root" - -export PKR_VAR_repo_root=$(realpath "$APPLIANCES_REPO_ROOT") -echo "Setting PKR_VAR_repo_root to $PKR_VAR_repo_root" - -if [ -f "$APPLIANCES_ENVIRONMENT_ROOT/ansible.cfg" ]; then - export ANSIBLE_CONFIG=$APPLIANCES_ENVIRONMENT_ROOT/ansible.cfg -fi - - From 7e07c4151f96bf116d65badd29c63f6a9f06af97 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 9 Nov 2023 20:36:11 +0000 Subject: [PATCH 34/48] base caas repo root off environment root --- environments/.caas/hooks/pre.yml | 10 ++++++++++ .../.caas/inventory/group_vars/all/cluster.yml | 3 +-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/environments/.caas/hooks/pre.yml b/environments/.caas/hooks/pre.yml index 05b0255c8..e7b8db335 100644 --- a/environments/.caas/hooks/pre.yml +++ b/environments/.caas/hooks/pre.yml @@ -1,5 +1,15 @@ --- +- hosts: all + become: no + gather_facts: no + tasks: + - debug: + msg: | + appliances_environment_root: {{ appliances_environment_root }} + appliances_repository_root: {{ appliances_repository_root }} + - meta: end_here + # Provision the infrastructure using Terraform - name: Provision infrastructure hosts: openstack diff --git a/environments/.caas/inventory/group_vars/all/cluster.yml b/environments/.caas/inventory/group_vars/all/cluster.yml index f416c2782..f1107063c 100644 --- a/environments/.caas/inventory/group_vars/all/cluster.yml +++ b/environments/.caas/inventory/group_vars/all/cluster.yml @@ -1,6 +1,5 @@ # Account for the fact we are running outside of the expected environment system: -# NB: this only works for playbooks in ansible/*, not in ansible/adhoc! 
-appliances_repository_root: "{{ playbook_dir }}/../" +appliances_repository_root: "{{ appliances_environment_root + '/../../' | normpath }}" # Read the secrets from the Ansible local facts on the control host vault_azimuth_user_password: "{{ hostvars[groups['control'][0]].ansible_local.openhpc_secrets.vault_azimuth_user_password }}" From d815ead5a0ce94d044f4da5b1950f52d07392806 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 9 Nov 2023 21:08:01 +0000 Subject: [PATCH 35/48] compute repo and env root for caas --- environments/.caas/hooks/pre.yml | 10 ---------- .../.caas/inventory/group_vars/all/cluster.yml | 4 +++- .../common/inventory/group_vars/all/update.yml | 2 +- 3 files changed, 4 insertions(+), 12 deletions(-) diff --git a/environments/.caas/hooks/pre.yml b/environments/.caas/hooks/pre.yml index e7b8db335..05b0255c8 100644 --- a/environments/.caas/hooks/pre.yml +++ b/environments/.caas/hooks/pre.yml @@ -1,15 +1,5 @@ --- -- hosts: all - become: no - gather_facts: no - tasks: - - debug: - msg: | - appliances_environment_root: {{ appliances_environment_root }} - appliances_repository_root: {{ appliances_repository_root }} - - meta: end_here - # Provision the infrastructure using Terraform - name: Provision infrastructure hosts: openstack diff --git a/environments/.caas/inventory/group_vars/all/cluster.yml b/environments/.caas/inventory/group_vars/all/cluster.yml index f1107063c..b9ea63586 100644 --- a/environments/.caas/inventory/group_vars/all/cluster.yml +++ b/environments/.caas/inventory/group_vars/all/cluster.yml @@ -1,5 +1,7 @@ # Account for the fact we are running outside of the expected environment system: -appliances_repository_root: "{{ appliances_environment_root + '/../../' | normpath }}" +caas_inventory: "{{ ansible_inventory_sources | last }}" # ansible_inventory_sources is absolute +appliances_environment_root: "{{ caas_inventory | dirname }}" +appliances_repository_root: "{{ appliances_environment_root | dirname | dirname }}" # Read the secrets from the Ansible local facts on the control host vault_azimuth_user_password: "{{ hostvars[groups['control'][0]].ansible_local.openhpc_secrets.vault_azimuth_user_password }}" diff --git a/environments/common/inventory/group_vars/all/update.yml b/environments/common/inventory/group_vars/all/update.yml index b409ea3d6..715d418c7 100644 --- a/environments/common/inventory/group_vars/all/update.yml +++ b/environments/common/inventory/group_vars/all/update.yml @@ -9,4 +9,4 @@ update_exclude: - apptainer # see https://github.com/stackhpc/ansible-slurm-appliance/pull/245 update_disablerepo: omit # Log changes during update here on localhost: -update_log_path: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/logs/{{ inventory_hostname }}-updates.log" +update_log_path: "{{ appliances_environment_root }}/logs/{{ inventory_hostname }}-updates.log" From c700bcaa5422372d0b52aabd8dfec20f2a41fa61 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 10 Nov 2023 09:50:15 +0000 Subject: [PATCH 36/48] symlink everything layout to caas --- environments/.caas/inventory/everything | 1 + environments/.caas/inventory/groups | 68 ------------------------- 2 files changed, 1 insertion(+), 68 deletions(-) create mode 120000 environments/.caas/inventory/everything delete mode 100644 environments/.caas/inventory/groups diff --git a/environments/.caas/inventory/everything b/environments/.caas/inventory/everything new file mode 120000 index 000000000..dc66b9576 --- /dev/null +++ b/environments/.caas/inventory/everything @@ -0,0 +1 @@ 
+../../../environments/common/layouts/everything \ No newline at end of file diff --git a/environments/.caas/inventory/groups b/environments/.caas/inventory/groups deleted file mode 100644 index 84e6e5a72..000000000 --- a/environments/.caas/inventory/groups +++ /dev/null @@ -1,68 +0,0 @@ -[nfs:children] -openhpc - -[mysql:children] -control - -[prometheus:children] -control - -[grafana:children] -control - -[alertmanager:children] -control - -[node_exporter:children] -cluster - -[opensearch:children] -control - -[slurm_stats:children] -control - -[filebeat:children] -slurm_stats - -# NB: [rebuild] not defined here as this template is used in CI, which does not run in openstack - -[update:children] -cluster - -[fail2ban:children] -# Hosts to install fail2ban on to protect SSH -login - -[block_devices:children] -# Environment-specific so not defined here - -[basic_users] -# Add `openhpc` group to add Slurm users via creation of users on each node. - -[openondemand:children] -# Host to run Open Ondemand server on - subset of login -login - -[openondemand_desktop:children] -# Subset of compute to run a interactive desktops on via Open Ondemand -compute - -[openondemand_jupyter:children] -# Subset of compute to run a Jupyter Notebook servers on via Open Ondemand -compute - -[etc_hosts] -# Hosts to manage /etc/hosts e.g. if no internal DNS. See ansible/roles/etc_hosts/README.md - -[cuda] -# Hosts to install NVIDIA CUDA on - see ansible/roles/cuda/README.md - -[eessi:children] -openhpc - -[resolv_conf] -# Allows defining nameservers in /etc/resolv.conf - see ansible/roles/resolv_conf/README.md - -[proxy] -# Hosts to configure http/s proxies - see ansible/roles/proxy/README.md From 4fbdaa260c79cc1e4d2189631daaf7b21d4566cf Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 10 Nov 2023 10:05:35 +0000 Subject: [PATCH 37/48] remove envvar APPLIANCES_ENVIRONMENT_NAME --- environments/.stackhpc/activate | 3 +-- environments/common/inventory/group_vars/all/defaults.yml | 1 + environments/common/inventory/group_vars/all/openondemand.yml | 2 +- environments/skeleton/{{cookiecutter.environment}}/activate | 3 +-- 4 files changed, 4 insertions(+), 5 deletions(-) diff --git a/environments/.stackhpc/activate b/environments/.stackhpc/activate index e74031095..2a58b40e4 100644 --- a/environments/.stackhpc/activate +++ b/environments/.stackhpc/activate @@ -1,8 +1,7 @@ export APPLIANCES_ENVIRONMENT_ROOT=$(dirname $(realpath ${BASH_SOURCE[0]:-${(%):-%x}})) echo "Setting APPLIANCES_ENVIRONMENT_ROOT to $APPLIANCES_ENVIRONMENT_ROOT" -APPLIANCES_ENVIRONMENT_NAME=$(basename $APPLIANCES_ENVIRONMENT_ROOT) -export PS1="${APPLIANCES_ENVIRONMENT_NAME}/ ${PS1}" +export PS1="$(basename $APPLIANCES_ENVIRONMENT_ROOT)/ ${PS1}" export APPLIANCES_REPO_ROOT=$(realpath "$APPLIANCES_ENVIRONMENT_ROOT/../..") echo "Setting APPLIANCES_REPO_ROOT to $APPLIANCES_REPO_ROOT" diff --git a/environments/common/inventory/group_vars/all/defaults.yml b/environments/common/inventory/group_vars/all/defaults.yml index a45d1ac23..23448c80d 100644 --- a/environments/common/inventory/group_vars/all/defaults.yml +++ b/environments/common/inventory/group_vars/all/defaults.yml @@ -3,6 +3,7 @@ ansible_user: rocky appliances_repository_root: "{{ lookup('env', 'APPLIANCES_REPO_ROOT') }}" appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}" +appliances_environment_name: "{{ appliances_environment_root | basename | regex_replace('\\W+', '') }}" # [a-zA-Z0-9_] only #appliances_state_dir: # define an absolute path here to use for 
persistent state: NB: This is defined as /var/lib/state in inventory by the default Terraform # Address(ip/dns) for internal communication between services. This is diff --git a/environments/common/inventory/group_vars/all/openondemand.yml b/environments/common/inventory/group_vars/all/openondemand.yml index 660b7631c..a675279ba 100644 --- a/environments/common/inventory/group_vars/all/openondemand.yml +++ b/environments/common/inventory/group_vars/all/openondemand.yml @@ -178,7 +178,7 @@ openondemand_scrape_configs: - targets: - "{{ openondemand_address }}:9301" labels: - environment: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_NAME') }}" + environment: "{{ appliances_environment_name }}" service: "openondemand" openondemand_dashboard: diff --git a/environments/skeleton/{{cookiecutter.environment}}/activate b/environments/skeleton/{{cookiecutter.environment}}/activate index e74031095..2a58b40e4 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/activate +++ b/environments/skeleton/{{cookiecutter.environment}}/activate @@ -1,8 +1,7 @@ export APPLIANCES_ENVIRONMENT_ROOT=$(dirname $(realpath ${BASH_SOURCE[0]:-${(%):-%x}})) echo "Setting APPLIANCES_ENVIRONMENT_ROOT to $APPLIANCES_ENVIRONMENT_ROOT" -APPLIANCES_ENVIRONMENT_NAME=$(basename $APPLIANCES_ENVIRONMENT_ROOT) -export PS1="${APPLIANCES_ENVIRONMENT_NAME}/ ${PS1}" +export PS1="$(basename $APPLIANCES_ENVIRONMENT_ROOT)/ ${PS1}" export APPLIANCES_REPO_ROOT=$(realpath "$APPLIANCES_ENVIRONMENT_ROOT/../..") echo "Setting APPLIANCES_REPO_ROOT to $APPLIANCES_REPO_ROOT" From 06ccb87c9d29186edf1b86a2bfb7e394880a5318 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 10 Nov 2023 10:15:50 +0000 Subject: [PATCH 38/48] fix zenith proxying --- environments/.caas/hooks/post.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/.caas/hooks/post.yml b/environments/.caas/hooks/post.yml index 8b6b6aafc..7aafe409f 100644 --- a/environments/.caas/hooks/post.yml +++ b/environments/.caas/hooks/post.yml @@ -25,7 +25,7 @@ vars: zenith_proxy_service_name: zenith-monitoring # Use the IP address for the upstream host - zenith_proxy_upstream_host: "{{ internal_address }}" + zenith_proxy_upstream_host: "{{ ansible_host }}" # IP zenith_proxy_upstream_port: "{{ grafana_port }}" zenith_proxy_client_token: "{{ zenith_token_monitoring }}" zenith_proxy_client_auth_params: @@ -45,7 +45,7 @@ zenith_proxy_service_name: zenith-ood # Use the IP address for the upstream host zenith_proxy_upstream_scheme: https - zenith_proxy_upstream_host: "{{ internal_address }}" + zenith_proxy_upstream_host: "{{ ansible_host }}" # IP zenith_proxy_upstream_port: 443 zenith_proxy_client_token: "{{ zenith_token_ood }}" zenith_proxy_client_auth_params: From 9dc381e92d924ebbf5d2b23aa89d5fdd4c101895 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 10 Nov 2023 15:56:23 +0000 Subject: [PATCH 39/48] fix hooks to not require APPLIANCES_ENVIRONMENT_ROOT env var --- ansible/site.yml | 9 ++++++--- environments/.caas/hooks/pre.yml | 10 ++++++++++ 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/ansible/site.yml b/ansible/site.yml index 13cf708c2..1804a2365 100644 --- a/ansible/site.yml +++ b/ansible/site.yml @@ -2,7 +2,8 @@ - name: Run pre.yml hook vars: - appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}" + # hostvars not available here, so have to recalculate environment root: + appliances_environment_root: "{{ ansible_inventory_sources | last | dirname }}" hook_path: "{{ appliances_environment_root 
}}/hooks/pre.yml" import_playbook: "{{ hook_path if hook_path | exists else 'noop.yml' }}" when: hook_path | exists @@ -14,7 +15,8 @@ - name: Run post-bootstrap.yml hook vars: - appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}" + # hostvars not available here, so have to recalculate environment root: + appliances_environment_root: "{{ ansible_inventory_sources | last | dirname }}" hook_path: "{{ appliances_environment_root }}/hooks/post-bootstrap.yml" import_playbook: "{{ hook_path if hook_path | exists else 'noop.yml' }}" when: hook_path | exists @@ -28,7 +30,8 @@ - name: Run post.yml hook vars: - appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}" + # hostvars not available here, so have to recalculate environment root: + appliances_environment_root: "{{ ansible_inventory_sources | last | dirname }}" hook_path: "{{ appliances_environment_root }}/hooks/post.yml" import_playbook: "{{ hook_path if hook_path | exists else 'noop.yml' }}" when: hook_path | exists diff --git a/environments/.caas/hooks/pre.yml b/environments/.caas/hooks/pre.yml index 05b0255c8..f0ab51f54 100644 --- a/environments/.caas/hooks/pre.yml +++ b/environments/.caas/hooks/pre.yml @@ -1,5 +1,15 @@ --- +- name: debug + hosts: + - localhost + - openstack + become: no + gather_facts: no + tasks: + - debug: + msg: "Starting pre-hook" + # Provision the infrastructure using Terraform - name: Provision infrastructure hosts: openstack From 1d75ab226e909281ca1991932f8e4706c0bba625 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 16 Nov 2023 16:58:49 +0000 Subject: [PATCH 40/48] remove image build related roles - currently unused --- ansible/roles/image_build/defaults/main.yml | 38 -------- ansible/roles/image_build/tasks/main.yml | 84 ---------------- ansible/roles/image_build/tasks/prechecks.yml | 22 ----- .../image_build/templates/ansible.cfg.j2 | 15 --- .../templates/builder.pkrvars.hcl.j2 | 38 -------- .../roles/image_build_infra/defaults/main.yml | 12 --- .../roles/image_build_infra/tasks/main.yml | 45 --------- .../image_build_infra/templates/outputs.tf.j2 | 27 ------ .../templates/providers.tf.j2 | 10 -- .../templates/resources.tf.j2 | 97 ------------------- 10 files changed, 388 deletions(-) delete mode 100644 ansible/roles/image_build/defaults/main.yml delete mode 100644 ansible/roles/image_build/tasks/main.yml delete mode 100644 ansible/roles/image_build/tasks/prechecks.yml delete mode 100644 ansible/roles/image_build/templates/ansible.cfg.j2 delete mode 100644 ansible/roles/image_build/templates/builder.pkrvars.hcl.j2 delete mode 100644 ansible/roles/image_build_infra/defaults/main.yml delete mode 100644 ansible/roles/image_build_infra/tasks/main.yml delete mode 100644 ansible/roles/image_build_infra/templates/outputs.tf.j2 delete mode 100644 ansible/roles/image_build_infra/templates/providers.tf.j2 delete mode 100644 ansible/roles/image_build_infra/templates/resources.tf.j2 diff --git a/ansible/roles/image_build/defaults/main.yml b/ansible/roles/image_build/defaults/main.yml deleted file mode 100644 index 37a3503d7..000000000 --- a/ansible/roles/image_build/defaults/main.yml +++ /dev/null @@ -1,38 +0,0 @@ ---- -# Attach a floating IP to the Packer build instance -image_build_attach_floating_ip: false - -# Use a volume for the root disk of the Packer build instance -image_build_use_blockstorage_volume: false - -# Packer image format (only used when image_build_use_blockstorage_volume: true -image_build_image_disk_format: "qcow2" - -# Metadata items to 
set on the Packer image -image_build_metadata: {} - -# The directory that contains the openstack.pkr.hcl to build the Slurm image -image_build_packer_root_path: "{{ playbook_dir }}/vendor/stackhpc/ansible-slurm-appliance/packer" - -# The appliances_environment_root directory. This may contain a hooks directory -# optionally containing pre.yml, post-bootstrap.yml and post.yml playbooks, to -# run during the image-build process -image_build_appliances_environment_root: "{{ playbook_dir }}/image-build" - -# Vars to apply to the builder group -image_build_builder_group_vars: - update_log_path: /tmp/update_log - appliances_repository_root: "{{ playbook_dir }}/vendor/stackhpc/ansible-slurm-appliance" - -# ansible_ssh_common_args for Packer build -image_build_ansible_ssh_common_args: >- - {% if image_build_ssh_bastion_host is defined %} - '-o ProxyCommand="ssh -W %h:%p -q - {% if image_build_ssh_bastion_private_key_file is defined %} - -i {{ image_build_ssh_bastion_private_key_file }} - {% endif %} - -l {{ image_build_ssh_bastion_username }} - {{ image_build_ssh_bastion_host }}"' - {% else %} - "" - {% endif %} diff --git a/ansible/roles/image_build/tasks/main.yml b/ansible/roles/image_build/tasks/main.yml deleted file mode 100644 index 5dc9b6450..000000000 --- a/ansible/roles/image_build/tasks/main.yml +++ /dev/null @@ -1,84 +0,0 @@ ---- - -- name: Run prechecks - include_tasks: prechecks.yml - -- name: Create temporary file for pkrvars.hcl - ansible.builtin.tempfile: - state: file - suffix: .pkrvars.hcl - register: pkrvars_hcl_file - -- name: Make Packer vars file - template: - src: builder.pkrvars.hcl.j2 - dest: "{{ pkrvars_hcl_file.path }}" - -- name: Create temporary image-build inventory directory - ansible.builtin.tempfile: - state: directory - prefix: image-build. - register: image_build_inventory - -- name: Symlink "everything" layout to image-build inventory - file: - state: link - src: "{{ playbook_dir }}/vendor/stackhpc/ansible-slurm-appliance/environments/common/layouts/everything" - dest: "{{ image_build_inventory.path }}/groups" - -- name: Symlink CAAS group_vars to image-build inventory - file: - state: link - src: "{{ playbook_dir }}/group_vars" - dest: "{{ image_build_inventory.path }}/group_vars" - -- name: Add builder vars to image-build inventory hosts file - copy: - dest: "{{ image_build_inventory.path }}/hosts" - content: | - {% raw %} - localhost ansible_connection=local ansible_python_interpreter="{{ ansible_playbook_python }}" - {% endraw %} - [builder:vars] - {% if image_build_ssh_bastion_host is defined %} - ansible_ssh_common_args={{ image_build_ansible_ssh_common_args }} - {% endif %} - {% for k,v in image_build_builder_group_vars.items() -%} - {{ k }}={{ v }} - {% endfor -%} - -- name: Create temporary file for ansible.cfg - ansible.builtin.tempfile: - state: file - suffix: ansible.cfg - register: ansible_cfg_file - -- name: Template image-build ansible.cfg - template: - src: ansible.cfg.j2 - dest: "{{ ansible_cfg_file.path }}" - -- name: Packer init - command: - cmd: | - packer init . 
- chdir: "{{ image_build_packer_root_path }}" - -- name: Build image with packer - command: - cmd: | - packer build -only openstack.openhpc -var-file={{ pkrvars_hcl_file.path }} openstack.pkr.hcl - chdir: "{{ image_build_packer_root_path }}" - environment: - APPLIANCES_ENVIRONMENT_ROOT: "{{ image_build_appliances_environment_root }}" - ANSIBLE_CONFIG: "{{ ansible_cfg_file.path }}" - PACKER_LOG: "1" - PACKER_LOG_PATH: "{{ lookup('ansible.builtin.env', 'PACKER_LOG_PATH', default='/tmp/packer-build.log') }}" - -- name: Parse packer-manifest.json - set_fact: - packer_manifest: "{{ lookup('file', '/tmp/builder.manifest.json') | from_json }}" - -- name: Extract image-build data - set_fact: - image_build_data: "{{ packer_manifest.builds | selectattr('packer_run_uuid', 'eq', packer_manifest.last_run_uuid) | first }}" diff --git a/ansible/roles/image_build/tasks/prechecks.yml b/ansible/roles/image_build/tasks/prechecks.yml deleted file mode 100644 index 38f1ff15e..000000000 --- a/ansible/roles/image_build/tasks/prechecks.yml +++ /dev/null @@ -1,22 +0,0 @@ ---- - -- name: Check required vars are defined - assert: - that: - - "{{ item }} is defined" - fail_msg: "{{ item }} is not defined" - loop: - - image_build_network_id - - image_build_floating_ip_network - - image_build_source_image_id - - image_build_security_group_id - -- name: Ensure builder access mode - fail: - msg: >- - Set either image_build_ssh_bastion_host or - image_build_attach_floating_ip to access the image - build instance via a bastion or directly - when: - - image_build_ssh_bastion_host is defined - - image_build_attach_floating_ip is defined and image_build_attach_floating_ip diff --git a/ansible/roles/image_build/templates/ansible.cfg.j2 b/ansible/roles/image_build/templates/ansible.cfg.j2 deleted file mode 100644 index acfd294ab..000000000 --- a/ansible/roles/image_build/templates/ansible.cfg.j2 +++ /dev/null @@ -1,15 +0,0 @@ -[defaults] -any_errors_fatal = True -gathering = smart -host_key_checking = False -remote_tmp = /tmp -roles_path = {{ playbook_dir }}/vendor/stackhpc/ansible-slurm-appliance/ansible/roles -inventory = {{ playbook_dir }}/vendor/stackhpc/ansible-slurm-appliance/environments/common/inventory,{{ image_build_inventory.path }} - -[ssh_connection] -ssh_args = -o ControlMaster=auto -o ControlPersist=240s -o PreferredAuthentications=publickey -o UserKnownHostsFile=/dev/null -pipelining = True -# This is important because we are using one of the hosts in the play as a jump host -# This ensures that if the proxy connection is interrupted, rendering the other hosts -# unreachable, the connection is retried instead of failing the entire play -retries = 10 diff --git a/ansible/roles/image_build/templates/builder.pkrvars.hcl.j2 b/ansible/roles/image_build/templates/builder.pkrvars.hcl.j2 deleted file mode 100644 index d1753225b..000000000 --- a/ansible/roles/image_build/templates/builder.pkrvars.hcl.j2 +++ /dev/null @@ -1,38 +0,0 @@ -repo_root = "{{ playbook_dir }}/vendor/stackhpc/ansible-slurm-appliance" -environment_root = "{{ playbook_dir }}/image_build" -networks = ["{{ image_build_network_id }}"] -{% if image_build_ssh_username is defined %} -ssh_username = "{{ image_build_ssh_username }}" -{% endif %} -{% if image_build_ssh_bastion_host is defined %} -ssh_bastion_host = "{{ image_build_ssh_bastion_host }}" -{% endif %} -{% if image_build_ssh_bastion_username is defined %} -ssh_bastion_username = "{{ image_build_ssh_bastion_username }}" -{% endif %} -{% if image_build_ssh_bastion_private_key_file is defined %} 
-ssh_bastion_private_key_file = "{{ image_build_ssh_bastion_private_key_file }}" -{% endif %} -{% if image_build_attach_floating_ip %} -floating_ip_network = "{{ image_build_floating_ip_network }}" -{% endif %} -security_groups = ["{{ image_build_security_group_id }}"] -fatimage_source_image = "{{ image_build_source_image_id }}" -{% if image_build_ssh_keypair_name is defined %} -ssh_keypair_name = "{{ image_build_ssh_keypair_name }}" -{% endif %} -{% if image_build_ssh_private_key_file is defined %} -ssh_private_key_file = "{{ image_build_ssh_private_key_file }}" -{% endif %} -flavor = "{{ image_build_flavor_name }}" -metadata = { -{% for k,v in image_build_metadata.items() %} - "{{ k }}" = "{{ v }}" -{% endfor %} -} -use_blockstorage_volume = {{ image_build_use_blockstorage_volume | string | lower }} -{% if image_build_use_blockstorage_volume %} -volume_size = {{ image_build_volume_size }} -image_disk_format = "{{ image_build_image_disk_format }}" -{% endif %} -manifest_output_path = "/tmp/builder.manifest.json" diff --git a/ansible/roles/image_build_infra/defaults/main.yml b/ansible/roles/image_build_infra/defaults/main.yml deleted file mode 100644 index adce2f827..000000000 --- a/ansible/roles/image_build_infra/defaults/main.yml +++ /dev/null @@ -1,12 +0,0 @@ ---- - -image_build_terraform_project_path: "{{ playbook_dir }}/terraform-caas-image-build" -image_build_cluster_name: "caas-image-build" - -# Regex to capture existing cloud image names to use as the -# OpenHPC Slurm base-image -image_build_existing_image_regex: "^Rocky-8-GenericCloud-Base-8.8-.*" -# Attributes to sort the list of existing base images returned by -# image_build_existing_image_regex. See -# https://registry.terraform.io/providers/terraform-provider-openstack/openstack/latest/docs/data-sources/images_image_ids_v2#sort -image_build_existing_image_sort_attributes: "name,updated_at" diff --git a/ansible/roles/image_build_infra/tasks/main.yml b/ansible/roles/image_build_infra/tasks/main.yml deleted file mode 100644 index 17dbc8566..000000000 --- a/ansible/roles/image_build_infra/tasks/main.yml +++ /dev/null @@ -1,45 +0,0 @@ ---- -- name: Install Terraform binary - include_role: - name: stackhpc.terraform.install - -- name: Make Terraform project directory - file: - path: "{{ image_build_terraform_project_path }}" - state: directory - -- name: Write backend configuration - copy: - content: | - terraform { - backend "{{ terraform_backend_type }}" { } - } - dest: "{{ image_build_terraform_project_path }}/backend.tf" - -- name: Template Terraform files into project directory - template: - src: "{{ item }}.j2" - dest: "{{ image_build_terraform_project_path }}/{{ item }}" - loop: - - outputs.tf - - providers.tf - - resources.tf - -- name: Provision infrastructure using Terraform - terraform: - binary_path: "{{ terraform_binary_path or omit }}" - project_path: "{{ image_build_terraform_project_path }}" - state: "{{ terraform_state }}" - backend_config: "{{ terraform_backend_config }}" - force_init: yes - init_reconfigure: yes - variables: "{{ image_build_terraform_variables | default(omit) }}" - register: image_build_terraform_provision - -- name: Set image build infrastructure facts - set_fact: - image_build_network_id: "{{ image_build_terraform_provision.outputs.network_id.value }}" - image_build_floating_ip_network: "{{ image_build_terraform_provision.outputs.floating_ip_network_id.value }}" - image_build_source_image_id: "{{ image_build_terraform_provision.outputs.source_image_name.value }}" - 
image_build_security_group_id: "{{ image_build_terraform_provision.outputs.security_group_id.value }}" - when: cluster_state is not defined or (cluster_state is defined and cluster_state != "absent") diff --git a/ansible/roles/image_build_infra/templates/outputs.tf.j2 b/ansible/roles/image_build_infra/templates/outputs.tf.j2 deleted file mode 100644 index 447ae9653..000000000 --- a/ansible/roles/image_build_infra/templates/outputs.tf.j2 +++ /dev/null @@ -1,27 +0,0 @@ -output "network_id" { - description = "The image build network ID" - value = data.openstack_networking_network_v2.caas_image_build_network.id -} - -output "source_image_name" { - description = "The id of the image used to build the cluster nodes" - {% if image_build_source_image_id is defined %} - value = "{{ image_build_source_image_id }}" - {% else %} - value = data.openstack_images_image_ids_v2.image_build_source_image.ids[0] - {% endif %} -} - -output "floating_ip_network_id" { - description = "Network to allocate floating IPs from" - value = data.openstack_networking_network_v2.caas_image_build_external_network.id -} - -output "security_group_id" { - description = "Security group ID to associate with the builder instance" - {% if image_build_security_group_id is defined %} - value = "{{ image_build_security_group_id }}" - {% else %} - value = openstack_networking_secgroup_v2.caas_image_build_secgroup.id - {% endif %} -} diff --git a/ansible/roles/image_build_infra/templates/providers.tf.j2 b/ansible/roles/image_build_infra/templates/providers.tf.j2 deleted file mode 100644 index 32a16f27b..000000000 --- a/ansible/roles/image_build_infra/templates/providers.tf.j2 +++ /dev/null @@ -1,10 +0,0 @@ -terraform { - required_version = ">= 0.14" - - # We need the OpenStack provider - required_providers { - openstack = { - source = "terraform-provider-openstack/openstack" - } - } -} diff --git a/ansible/roles/image_build_infra/templates/resources.tf.j2 b/ansible/roles/image_build_infra/templates/resources.tf.j2 deleted file mode 100644 index 0f8233ae9..000000000 --- a/ansible/roles/image_build_infra/templates/resources.tf.j2 +++ /dev/null @@ -1,97 +0,0 @@ -#jinja2: trim_blocks:False - -###### -###### Image build network -###### - -data "openstack_networking_network_v2" "caas_image_build_external_network" { - external = true -} - -{% if image_build_network_id is not defined %} -{% if image_build_network_name is not defined %} -# Create a network -resource "openstack_networking_network_v2" "caas_image_build_network" { - name = "{{ image_build_cluster_name }}" - admin_state_up = "true" -} - -resource "openstack_networking_subnet_v2" "caas_image_build_subnet" { - name = "{{ image_build_cluster_name }}" - network_id = "${openstack_networking_network_v2.caas_image_build_network.id}" - cidr = "192.168.244.0/24" - {% if image_build_nameservers is defined %} - dns_nameservers = [ - {% for nameserver in image_build_nameservers %} - "{{ nameserver }}"{{ ',' if not loop.last }} - {% endfor %} - ] - {% endif %} - ip_version = 4 -} - -resource "openstack_networking_router_v2" "caas_image_build_router" { - name = "{{ image_build_cluster_name }}" - admin_state_up = true - external_network_id = "${data.openstack_networking_network_v2.caas_image_build_external_network.id}" -} - -resource "openstack_networking_router_interface_v2" "caas_image_build_router_interface" { - router_id = "${openstack_networking_router_v2.caas_image_build_router.id}" - subnet_id = "${openstack_networking_subnet_v2.caas_image_build_subnet.id}" -} -{% endif %} -{% endif %} - 
-# Get existing network resource data by name, from either the created -# network or the network name if supplied -data "openstack_networking_network_v2" "caas_image_build_network" { - {% if image_build_network_id is defined %} - network_id = "{{ image_build_network_id }}" - {% elif image_build_network_name is defined %} - name = "{{ image_build_network_name }}" - {% else %} - network_id = "${openstack_networking_network_v2.caas_image_build_network.id}" - {% endif %} -} - -{% if image_build_source_image_id is not defined %} -###### -###### Image build base image -###### - -data "openstack_images_image_ids_v2" "image_build_source_image" { - name_regex = "{{ image_build_existing_image_regex }}" - sort = "{{ image_build_existing_image_sort_attributes }}" -} -{% endif %} - -{% if image_build_security_group_id is not defined %} -###### -###### Image build security groups -###### - -# Security group to hold specific rules for the image build instance -resource "openstack_networking_secgroup_v2" "caas_image_build_secgroup" { - name = "{{ image_build_cluster_name }}" - description = "Specific rules for caas image build" - delete_default_rules = true # Fully manage with terraform -} - -## Allow all egress for the image build instance -resource "openstack_networking_secgroup_rule_v2" "caas_image_build_secgroup_egress_v4" { - direction = "egress" - ethertype = "IPv4" - security_group_id = "${openstack_networking_secgroup_v2.caas_image_build_secgroup.id}" -} - -## Allow ingress on port 22 (SSH) from anywhere for the image build instance -resource "openstack_networking_secgroup_rule_v2" "caas_image_build_secgroup_ingress_ssh_v4" { - direction = "ingress" - ethertype = "IPv4" - protocol = "tcp" - port_range_min = 22 - port_range_max = 22 - security_group_id = "${openstack_networking_secgroup_v2.caas_image_build_secgroup.id}" -} -{% endif %} \ No newline at end of file From 28af9092770b66f03ee988e9dac1fd1b4358da6a Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 16 Nov 2023 18:03:57 +0000 Subject: [PATCH 41/48] move ood icon into this repo --- environments/.caas/assets/ood-icon.png | Bin 0 -> 11306 bytes environments/.caas/ui-meta/slurm-infra.yml | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) create mode 100644 environments/.caas/assets/ood-icon.png diff --git a/environments/.caas/assets/ood-icon.png b/environments/.caas/assets/ood-icon.png new file mode 100644 index 0000000000000000000000000000000000000000..b5f4b6ea78505e6f68e6972313467d42a34d3e87 GIT binary patch literal 11306 zcmeHtWmH?+)^;fFP_$?oid%4Zr?^{7LkJLDg10!7QrwCccXxMafl|Cs9Euhx6bcl5 z)1GthJ!gFHpF76){yWLo$=+){^O&N{dHhr zAV2A(!fOBk3MC&sBZLmb6X@*fWCgc}0TJHLFd)neZUq2%%~zyZd(em^hu%TS>d?IE zXy)9kFgvb)a)A|Ho7Hs)FP8^B;H6 zhd%5sS*te$Tt>HHM<&waQckklDyvnvNrdy`UUY{mh%eh&0oQCA^imn0XgCFKzspI% zHov4^{n*>GYs~e3@#s0W>#f)I=H6g|x%Fn)?NloCg~W+eS611`SVX7%9mZX`&!K=D znS_;Wra8rr&cv(8vAF1BUBkXfy|q-TQt_`%$EGD;RaiO14pu!|x5Rfvxmq_{#T-}O z?J_-%^rJTSTE976>ok%(9@VVfmYuE=>x}IqV^hfzI^em8X=wJHw<>C}F3a#1k^D-F z=~nu>>qfx;D%5Qn=i}!~V)jp)9YLeKi6S?fP|N;IL#-+NqpS11nD12YpXnsdfTFsi zw}}{9=#_~ab5ZHl#3dzmhJ9!Qh+Mw*TbKb+Ek{E)YU>8B)@cIE9~IXj=yT~Uy9+ib zUZ3)Ws5$OD(ik<2XruozbW*$K-W{oyA1n7z2=QPjf=645(ZII2DAq!iwJ6@!wq!dN zsv)u+j~>wxa$jAA@r{lxykJ%fsxC0AW9QHyv~n*y)x@!2=99^{j5@cCk4<9qJ1;JX z6={Yd#fOXYq9j@0KCn*nPBykqfAN!y{G#SNTUlk}rANBXyh~B0=V$k(CBg!aAd%zz z1Wl12^;1%{XR3QmD_!r?d|{(2YP-Rk40i1%!&gKa&hd$S_qWs$mhCL)r1gd$?cI5N zU>YKIqf=xAm@v8V7}I#JvZ z< z=!=a(qdco?B-d~G?rkL;@|Lhk)*7|F<^%psi?_W?2&_PQ&3k27l$odtrAlIU!gJ-d zxm2!KPHVh19nqz|jJ8xklen3R*B~G;_wzxQ4ELvl{IC$;AD9V=@EP 
[... base85-encoded GIT binary patch data for environments/.caas/assets/ood-icon.png (literal 11306 bytes) omitted ...]
zrLjZngVIFVNEp4ad#GM3SIFxH89vID=aH{qy8gmxF`dGn`-+&9?M*ll$1AQLs%hm& zp7{tlmo(rUXW0Dfoy1Aos#O=Uu12khe;}A<>TX7 zu=r8+t(9{OULLnA5}NxECqvPnM${dHJ~*J%61E)~iqpM|F6ST^Wr-D6+5Ym}EJA3V z5yi$Jbc7J2SZZzi0=>~1=Ca?6GL(RzbQ+lPLe)5Qm5y0sy+-p~9dkz!u)Rmwt8js5 z(8!Vgba1K}SI8Q6+?0hnlyJ&lKA_|haXTn{8%}-v%z#|U_ueDTNy9bjW=hm(^K0Cd z{-)YmuS#~HcYI}k)3GWa$;%qLw`0TWsnG!&Dy=w8s{>YogGX}ZnK)`tDx#{s1FuvP zhQXsJi?K-qfkXZTYE!Ep`&8n*m}T)+8Jf8?Cj&+KcyYJaUo`WkJ(-c03i7wGezdRs z;T_y4DD(;c7ys4Lat^^y>Sq|q=-KC(Hz}oV-y#TZ_%`N(@A_*gaR4JnKIMaUJXNUR zt^qP*M>zG9o4k-ISU@$hOISe?=h|t8yUZZ*=`AWdpSezhAT!mtts$C-Bn(q9W zomgiA^X`L`grtEpuAyzB%HG+UX5|0u-Vd)2Vgv1tlgO9(>7H-D({7`$kR|p3qGMP{ z&nj@~zf!6ZWq;j8xGItGenp<^ITb@QtL7x`_eYsan!tz6j_*bVRBt0#XsEp79dTu* zD>tH61?>+XQP_U?kUB?1R-w8W6k~XRTeRMnL*Y1@aH{SdpEX^S`k9#aT6y3xSND3W zng@?0B}1d6=Hz1}>nx@?^|8@_+Z%{?QKEQ{uM7R*a00?TD<9WKn7!>-Yhdc7i$>GD z#lz(XqeofdDl5cMl@!NfdOVC)Yx_@M<)eFE{pwV>4*>DDri8=e1>Fg1& zn2_d{sfxdJ2&M(h(QF(CmUEn)mKd4k!=nvf(YgoFL~69rQ5W$I`|wlKm*s5Ir@oIv zoys_Mx@JlqA1vn>mV8~Nc!5%8OPXp9gSjBUG%;ie{dczjXLdT&Tu7B){ckcmD7u|!exU6ZO+C9bSHIW~nbLLl-`IH8U}$$p_VI15h-BE69I3>s4idr$`K&y6dKSgm zR2neF9m>vyoMYT$d^?3G&Z>ZBreqzmOi4OX3;t;9+~?d8Kl}-j4cce*LQY%DIS%b_ zT%6N*FDk1;o{}&a3s5}_C24Pcr|%%1cHEJ0N0}na7o$WW)x;F;_fGS8hJsCTYB!E} zbaYID87n;MLXVP!4fU1rPwjxuBEf*c<7XQKXC)d`HqepCHOJA+Tnh|o8XO;lg8_No z!Tma0LY3o!XMnmG)NtA!(ax1<2x5}xWnW60Rxrp*ID^Yzt^=KfEUtk z?3EPp-t=??xtAjp^DPG=vB^*C_!Ae?c7%#%S!TZ+e_sgdiC!giM?v6Q-Y;%RqbJLt zH@PAc);&p2T8qWP>~mhArS*3MA8^hs9bm%m&_&5dPs2^ml94MpKt(}AzEbvC(Ek8B CmZc*A literal 0 HcmV?d00001 diff --git a/environments/.caas/ui-meta/slurm-infra.yml b/environments/.caas/ui-meta/slurm-infra.yml index ed953b926..7897e5507 100644 --- a/environments/.caas/ui-meta/slurm-infra.yml +++ b/environments/.caas/ui-meta/slurm-infra.yml @@ -95,7 +95,7 @@ usage_template: |- services: - name: ood label: Open OnDemand - icon_url: https://github.com/stackhpc/caas-slurm-appliance/raw/main/assets/ood-icon.png + icon_url: https://github.com/stackhpc/ansible-slurm-appliance/raw/main/environments/.caas/assets/ood-icon.png - name: monitoring label: Monitoring icon_url: https://raw.githubusercontent.com/cncf/artwork/master/projects/prometheus/icon/color/prometheus-icon-color.png From 254aa54ddfd5c0c07b37105dab2857b079c4c879 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 17 Nov 2023 14:24:42 +0000 Subject: [PATCH 42/48] merge PR: Add restart delay for Zenith client units caas-slurm-appliance#44 --- ansible/roles/zenith_proxy/templates/client.service.j2 | 1 + 1 file changed, 1 insertion(+) diff --git a/ansible/roles/zenith_proxy/templates/client.service.j2 b/ansible/roles/zenith_proxy/templates/client.service.j2 index ba4acf0b7..809b19b87 100644 --- a/ansible/roles/zenith_proxy/templates/client.service.j2 +++ b/ansible/roles/zenith_proxy/templates/client.service.j2 @@ -14,6 +14,7 @@ After={{ zenith_proxy_mitm_service_name }}.service Environment=PODMAN_SYSTEMD_UNIT=%n Type=simple Restart=always +RestartSec=5 User={{ zenith_proxy_podman_user }} Group={{ zenith_proxy_podman_user }} ExecStart=/usr/bin/podman run \ From 9573452e553cdba62bd525fb9013c093a0a222f5 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 17 Nov 2023 14:27:22 +0000 Subject: [PATCH 43/48] Merge PR: Note rocky user has passwordless sudo in usage message caas-slurm-appliance#43 --- environments/.caas/ui-meta/slurm-infra-fast-volume-type.yml | 2 ++ environments/.caas/ui-meta/slurm-infra.yml | 2 ++ 2 files changed, 4 insertions(+) diff --git a/environments/.caas/ui-meta/slurm-infra-fast-volume-type.yml 
b/environments/.caas/ui-meta/slurm-infra-fast-volume-type.yml index 899e3b439..6146f1896 100644 --- a/environments/.caas/ui-meta/slurm-infra-fast-volume-type.yml +++ b/environments/.caas/ui-meta/slurm-infra-fast-volume-type.yml @@ -103,6 +103,8 @@ usage_template: |- compute* up 60-00:00:0 {{ "%3s" | format(cluster.parameter_values.compute_count) }} idle {{ cluster.name }}-compute-[0-{{ cluster.parameter_values.compute_count - 1 }}] ``` + The `rocky` user can be accessed the same way and has passwordless `sudo` enabled. + SSH access can be granted to additional users by placing their SSH public key in `~azimuth/.ssh/authorized_keys`. services: diff --git a/environments/.caas/ui-meta/slurm-infra.yml b/environments/.caas/ui-meta/slurm-infra.yml index 7897e5507..250b96469 100644 --- a/environments/.caas/ui-meta/slurm-infra.yml +++ b/environments/.caas/ui-meta/slurm-infra.yml @@ -90,6 +90,8 @@ usage_template: |- compute* up 60-00:00:0 {{ "%3s" | format(cluster.parameter_values.compute_count) }} idle {{ cluster.name }}-compute-[0-{{ cluster.parameter_values.compute_count - 1 }}] ``` + The `rocky` user can be accessed the same way and has passwordless `sudo` enabled. + SSH access can be granted to additional users by placing their SSH public key in `~azimuth/.ssh/authorized_keys`. services: From cdcb7a444b70a09fbddd3a4872ce212e2ffd2a9d Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 17 Nov 2023 14:28:08 +0000 Subject: [PATCH 44/48] fix OOD icon for fast volume UI --- environments/.caas/ui-meta/slurm-infra-fast-volume-type.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environments/.caas/ui-meta/slurm-infra-fast-volume-type.yml b/environments/.caas/ui-meta/slurm-infra-fast-volume-type.yml index 6146f1896..d210fec47 100644 --- a/environments/.caas/ui-meta/slurm-infra-fast-volume-type.yml +++ b/environments/.caas/ui-meta/slurm-infra-fast-volume-type.yml @@ -110,7 +110,7 @@ usage_template: |- services: - name: ood label: Open OnDemand - icon_url: https://github.com/stackhpc/caas-slurm-appliance/raw/main/assets/ood-icon.png + icon_url: https://github.com/stackhpc/ansible-slurm-appliance/raw/main/environments/.caas/assets/ood-icon.png - name: monitoring label: Monitoring icon_url: https://raw.githubusercontent.com/cncf/artwork/master/projects/prometheus/icon/color/prometheus-icon-color.png From 85b2feb57f5fd59a406e6df9ac0939374163fb23 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 17 Nov 2023 15:53:18 +0000 Subject: [PATCH 45/48] Merge PR: Bump zenith proxy client and mitm image to v0.1.0 caas-slurm-appliance#45 --- ansible/roles/zenith_proxy/defaults/main.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ansible/roles/zenith_proxy/defaults/main.yml b/ansible/roles/zenith_proxy/defaults/main.yml index 70e92d648..dbb920c58 100644 --- a/ansible/roles/zenith_proxy/defaults/main.yml +++ b/ansible/roles/zenith_proxy/defaults/main.yml @@ -15,13 +15,13 @@ zenith_proxy_pod_name: "{{ zenith_proxy_service_name }}" zenith_proxy_client_container_name: "{{ zenith_proxy_client_service_name }}" zenith_proxy_mitm_container_name: "{{ zenith_proxy_mitm_service_name }}" +zenith_proxy_image_tag: '0.1.0' + zenith_proxy_client_image_repository: ghcr.io/stackhpc/zenith-client -zenith_proxy_client_image_tag: main -zenith_proxy_client_image: "{{ zenith_proxy_client_image_repository }}:{{ zenith_proxy_client_image_tag }}" +zenith_proxy_client_image: "{{ zenith_proxy_client_image_repository }}:{{ zenith_proxy_image_tag }}" 
zenith_proxy_mitm_image_repository: ghcr.io/stackhpc/zenith-proxy -zenith_proxy_mitm_image_tag: main -zenith_proxy_mitm_image: "{{ zenith_proxy_mitm_image_repository }}:{{ zenith_proxy_mitm_image_tag }}" +zenith_proxy_mitm_image: "{{ zenith_proxy_mitm_image_repository }}:{{ zenith_proxy_image_tag }}" zenith_proxy_upstream_scheme: http zenith_proxy_upstream_host: "{{ undef(hint = 'zenith_proxy_upstream_host is required') }}" From b78f1e418405d60ccb2ac8b64b0322270520ba1f Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 24 Nov 2023 10:20:42 +0000 Subject: [PATCH 46/48] bump ansible-collection-terraform version; can't specify terraform_binary_path anymore --- environments/.caas/inventory/group_vars/openstack.yml | 3 +-- requirements.yml | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/environments/.caas/inventory/group_vars/openstack.yml b/environments/.caas/inventory/group_vars/openstack.yml index e8a99007c..836078e10 100644 --- a/environments/.caas/inventory/group_vars/openstack.yml +++ b/environments/.caas/inventory/group_vars/openstack.yml @@ -10,8 +10,7 @@ terraform_backend_config_defaults: local: {} terraform_backend_config: "{{ terraform_backend_config_defaults[terraform_backend_type] }}" -terraform_binary_directory: "{{ playbook_dir }}/bin" -terraform_binary_path: "{{ terraform_binary_directory }}/terraform" +terraform_binary_directory: "{{ appliances_environment_root }}/bin" terraform_project_path: "{{ playbook_dir }}/terraform" terraform_state: "{{ cluster_state | default('present') }}" diff --git a/requirements.yml b/requirements.yml index 1f762e2ad..4cc3b735b 100644 --- a/requirements.yml +++ b/requirements.yml @@ -43,5 +43,5 @@ collections: version: 2.1.0 - name: https://github.com/stackhpc/ansible-collection-terraform type: git - version: 1a8f5af0239de2bfedb37f51e20d973e05699b8a # main @ 20230627 + version: 0.1.0 ... From b01551d226a8df0eb7bf837947aa64f3567c10b4 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 24 Nov 2023 11:03:08 +0000 Subject: [PATCH 47/48] remove debug code --- environments/.caas/hooks/pre.yml | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/environments/.caas/hooks/pre.yml b/environments/.caas/hooks/pre.yml index f0ab51f54..05b0255c8 100644 --- a/environments/.caas/hooks/pre.yml +++ b/environments/.caas/hooks/pre.yml @@ -1,15 +1,5 @@ --- -- name: debug - hosts: - - localhost - - openstack - become: no - gather_facts: no - tasks: - - debug: - msg: "Starting pre-hook" - # Provision the infrastructure using Terraform - name: Provision infrastructure hosts: openstack From 268a3bc2c6f951b6bf65999df42c136b304ba036 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 24 Nov 2023 11:09:58 +0000 Subject: [PATCH 48/48] add caas environment readme --- environments/.caas/README.md | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/environments/.caas/README.md b/environments/.caas/README.md index 972640bae..4a08433b0 100644 --- a/environments/.caas/README.md +++ b/environments/.caas/README.md @@ -1,5 +1,18 @@ # Caas cluster -Default Azimuth Slurm +Environment for default Azimuth Slurm. This is not intended to be manually deployed. -See the main README.md in the repo root for an overview and general install instructions. Any environment-specific instructions should be added here. \ No newline at end of file +Non-standard things for this environment: +- There is no activate script. +- `ansible.cfg` is provided in the repo root, as expected by the caas operator. 
+- `ANSIBLE_INVENTORY` is set in the cluster type template, using a path relative to the + runner project directory: + + azimuth_caas_stackhpc_slurm_appliance_template: + ... + envVars: + ANSIBLE_INVENTORY: environments/common/inventory,environments/.caas/inventory + + Ansible then defines `ansible_inventory_sources` which contains absolute paths, and + that is used to derive the `appliances_environment_root` and + `appliances_repository_root`.
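
For illustration only, a minimal sketch of the derivation described in the README above, reusing the exact expressions introduced in `ansible/site.yml` and `environments/common/inventory/group_vars/all/defaults.yml` in this series; the standalone playbook wrapper and the debug task are hypothetical and not part of the appliance:

```yaml
# Sketch: run with ANSIBLE_INVENTORY=environments/common/inventory,environments/.caas/inventory
- hosts: localhost
  gather_facts: false
  vars:
    # the last inventory source is environments/.caas/inventory, so its dirname is the environment root
    appliances_environment_root: "{{ ansible_inventory_sources | last | dirname }}"
    # environment name derived from the root, restricted to [a-zA-Z0-9_] as in defaults.yml
    appliances_environment_name: "{{ appliances_environment_root | basename | regex_replace('\\W+', '') }}"
  tasks:
    - ansible.builtin.debug:
        msg: "{{ appliances_environment_root }} ({{ appliances_environment_name }})"
```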