Skip to content

Commit

Permalink
Merge pull request #325 from stackhpc/feat/caas
Browse files Browse the repository at this point in the history
Merge caas slurm appliance into slurm appliance
  • Loading branch information
sjpb authored Nov 24, 2023
2 parents 6f31af4 + 268a3bc commit ea3155a
Show file tree
Hide file tree
Showing 51 changed files with 1,595 additions and 43 deletions.
19 changes: 19 additions & 0 deletions ansible.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Only used for Azimuth running the caas environment
[defaults]
# Stop the run for every host as soon as any single host fails a task
any_errors_fatal = True
# Gather facts only for hosts that do not already have them cached
gathering = smart
# Upper bound on hosts worked on in parallel
forks = 30
# Do not verify SSH host keys of managed nodes
# NOTE(review): presumably acceptable because nodes are freshly provisioned - confirm
host_key_checking = False
# Directory on the managed node used for Ansible's temporary files
remote_tmp = /tmp
# Search paths for collections, roles and filter plugins shipped in this repository
collections_path = ansible/collections
roles_path = ansible/roles
filter_plugins = ansible/filter_plugins
# Enable the per-task timing callback from the ansible.posix collection
callbacks_enabled = ansible.posix.profile_tasks

[ssh_connection]
# Multiplex SSH connections (kept open for 240s), allow public key authentication
# only, and discard learned host keys instead of recording them
ssh_args = -o ControlMaster=auto -o ControlPersist=240s -o PreferredAuthentications=publickey -o UserKnownHostsFile=/dev/null
# Run modules over the existing SSH session rather than copying files first
pipelining = True
# This is important because we are using one of the hosts in the play as a jump host
# This ensures that if the proxy connection is interrupted, rendering the other hosts
# unreachable, the connection is retried instead of failing the entire play
retries = 10
15 changes: 13 additions & 2 deletions ansible/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,6 @@ roles/*
!roles/firewalld/**
!roles/etc_hosts/
!roles/etc_hosts/**
!roles/cloud_init/
!roles/cloud_init/**
!roles/mysql/
!roles/mysql/**
!roles/systemd/
Expand All @@ -44,3 +42,16 @@ roles/*
!roles/resolv_conf/**
!roles/cve-2023-41914
!roles/cve-2023-41914/**
!roles/cluster_infra/
!roles/cluster_infra/**
!roles/image_build_infra/
!roles/image_build_infra/**
!roles/persist_openhpc_secrets/
!roles/persist_openhpc_secrets/**
!roles/zenith_proxy/
!roles/zenith_proxy/**
!roles/image_build/
!roles/image_build/**
!roles/persist_hostkeys/
!roles/persist_hostkeys/**
!roles/requirements.yml
9 changes: 0 additions & 9 deletions ansible/adhoc/template-cloud-init.yml

This file was deleted.

1 change: 1 addition & 0 deletions ansible/bootstrap.yml
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,7 @@
name: fail2ban

- name: Setup podman
gather_facts: false
hosts: podman
tags: podman
tasks:
Expand Down
2 changes: 1 addition & 1 deletion ansible/noop.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,4 @@

- hosts: localhost
gather_facts: false
tasks: []
tasks: []
7 changes: 7 additions & 0 deletions ansible/roles/cluster_infra/defaults/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Empty by default.
# NOTE(review): presumably extra SSH public keys to authorise on the cluster;
# the consumer is not visible in this role's tasks - confirm.
cluster_deploy_ssh_keys_extra: []

# Values of the image property hw_scsi_model for which block devices are
# presented as /dev/sdX instead of /dev/vdX
scsi_models:
  # Ceph [https://docs.ceph.com/en/quincy/rbd/rbd-openstack/#image-properties]
  - virtio-scsi
104 changes: 104 additions & 0 deletions ansible/roles/cluster_infra/tasks/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
# Print the inputs that steer the rest of this role, to ease troubleshooting.
# cluster_upgrade_system_packages is optional, so a placeholder is shown when
# it has not been supplied.
- name: Show Terraform backend, state and upgrade settings
  ansible.builtin.debug:
    msg: |
      terraform_backend_type: {{ terraform_backend_type }}
      terraform_state: {{ terraform_state }}
      cluster_upgrade_system_packages: {{ cluster_upgrade_system_packages | default('undefined') }}

# We need to convert the floating IP id to an address for Terraform
# if we have cluster_floating_ip, otherwise assume that we're
# assigning the FIP in Terraform and that it will be available in
# outputs.cluster_gateway_ip.
# Both tasks are skipped unless a floating IP id was supplied by the caller.
- when: cluster_floating_ip is defined
  block:
    - name: Look up floating IP
      ansible.builtin.include_role:
        name: stackhpc.terraform.infra
        tasks_from: lookup_floating_ip
      vars:
        os_floating_ip_id: "{{ cluster_floating_ip }}"

    # NOTE(review): os_floating_ip_info is presumably registered by the
    # lookup_floating_ip tasks included above - confirm against that role.
    - name: Set floating IP address fact
      ansible.builtin.set_fact:
        cluster_floating_ip_address: "{{ os_floating_ip_info.floating_ip_address }}"

# Prepare the Terraform tooling and an empty project skeleton.
- name: Install Terraform binary
  ansible.builtin.include_role:
    name: stackhpc.terraform.install

- name: Make Terraform project directory
  ansible.builtin.file:
    state: directory
    path: "{{ terraform_project_path }}"

# Only the backend type is written here; the backend block itself is left
# empty in the generated file.
- name: Write backend configuration
  ansible.builtin.copy:
    dest: "{{ terraform_project_path }}/backend.tf"
    content: |
      terraform {
        backend "{{ terraform_backend_type }}" { }
      }

# A patch of this appliance is delivered as a switch to a new base image.
# When no patch was requested, the image recorded in the existing Terraform
# state is read back before that state is updated, so it can be reused.
- when:
    - terraform_state == "present"
    - not (cluster_upgrade_system_packages | default(false))
  block:
    - name: Get previous Terraform state
      stackhpc.terraform.terraform_output:
        binary_path: "{{ terraform_binary_path }}"
        project_path: "{{ terraform_project_path }}"
        backend_config: "{{ terraform_backend_config }}"
      register: cluster_infra_terraform_output

    # Skipped when the state holds no cluster_image output.
    - name: Extract image from Terraform state
      ansible.builtin.set_fact:
        cluster_previous_image: "{{ cluster_infra_terraform_output.outputs.cluster_image.value }}"
      when: "'cluster_image' in cluster_infra_terraform_output.outputs"

# Derive block_device_prefix ('sd' or 'vd') from the hw_scsi_model property of
# the image the cluster will use: the previously deployed image when one was
# read from Terraform state, otherwise cluster_image.
- name: Detect volume device prefix from image metadata
  # Only run when block_device_prefix isn't set as an extravar
  when:
    - block_device_prefix is not defined
    - cluster_image is defined
  block:
    - name: Get image metadata from OpenStack API
      openstack.cloud.image_info:
        image: "{{ cluster_previous_image | default(cluster_image) }}"
      register: cluster_image_info

    # The assertion fails for zero matches as well as for several, so the
    # message reports the actual count and the image that was looked up.
    - name: Check exactly one image found
      ansible.builtin.assert:
        that: cluster_image_info.images | length == 1
        fail_msg: >-
          Expected exactly one image matching
          '{{ cluster_previous_image | default(cluster_image) }}'
          but found {{ cluster_image_info.images | length }}

    # 'sd' only when the image declares a hw_scsi_model listed in scsi_models;
    # every other case (including no hw_scsi_model at all) gives 'vd'.
    - name: Set block_device_prefix fact
      ansible.builtin.set_fact:
        block_device_prefix: >-
          {{
            'sd' if (cluster_image_info.images | first).hw_scsi_model is defined and
              (cluster_image_info.images | first).hw_scsi_model in scsi_models
            else 'vd'
          }}

# Render <item>.j2 to <item> in the project directory. The template is looked
# up under cluster_terraform_template_dir when that variable is defined,
# otherwise by its bare name.
- name: Template Terraform files into project directory
  ansible.builtin.template:
    src: >-
      {{
        (
          (cluster_terraform_template_dir ~ "/")
          if cluster_terraform_template_dir is defined
          else ""
        ) ~ item ~ ".j2"
      }}
    dest: "{{ terraform_project_path }}/{{ item }}"
  loop:
    - outputs.tf
    - providers.tf
    - resources.tf

# Hand over to the stackhpc.terraform.infra role's default entry point.
- name: Provision infrastructure
  ansible.builtin.include_role:
    name: stackhpc.terraform.infra
53 changes: 53 additions & 0 deletions ansible/roles/cluster_infra/templates/outputs.tf.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# Exposes the floating IP held by the login_floatingip_assoc resource, which
# is declared outside this template.
output "cluster_gateway_ip" {
  description = "The IP address of the gateway used to contact the cluster nodes"
  value       = openstack_compute_floatingip_associate_v2.login_floatingip_assoc.floating_ip
}

{% if cluster_ssh_private_key_file is not defined %}
# Rendered only when no cluster_ssh_private_key_file was supplied; marked
# sensitive so Terraform redacts the key in its CLI output.
output "cluster_ssh_private_key" {
  description = "The private component of the keypair generated on cluster provision"
  sensitive   = true
  value       = openstack_compute_keypair_v2.cluster_keypair.private_key
}
{% endif %}

# One entry per node: the login node, the control node, then every instance of
# each partition in openhpc_slurm_partitions. Each entry carries the instance
# name, the IPv4 address on its first network, its inventory groups and the
# OpenStack project id as an extra fact.
output "cluster_nodes" {
  description = "A list of the nodes in the cluster from which an Ansible inventory will be populated"
  value = concat(
    [
      {
        name   = openstack_compute_instance_v2.login.name
        ip     = openstack_compute_instance_v2.login.network[0].fixed_ip_v4
        groups = ["login", "{{ cluster_name }}_login"]
        facts = {
          openstack_project_id = data.openstack_identity_auth_scope_v3.scope.project_id
        }
      },
      {
        name   = openstack_compute_instance_v2.control.name
        ip     = openstack_compute_instance_v2.control.network[0].fixed_ip_v4
        groups = ["control", "{{ cluster_name }}_control"]
        facts = {
          openstack_project_id = data.openstack_identity_auth_scope_v3.scope.project_id
        }
      }
    ],
{% for partition in openhpc_slurm_partitions %}
    [
      for compute in openstack_compute_instance_v2.{{ partition.name }} : {
        name   = compute.name
        ip     = compute.network[0].fixed_ip_v4
        groups = ["compute", "{{ cluster_name }}_compute", "{{ cluster_name }}_{{ partition.name }}"]
        facts = {
          openstack_project_id = data.openstack_identity_auth_scope_v3.scope.project_id
        }
      }
    ]{{ '' if loop.last else ',' }}
{% endfor %}
  )
}

# Records the image in Terraform state: the previous image when one was read
# back, otherwise cluster_image. The value is fixed when the template renders.
output "cluster_image" {
  description = "The id of the image used to build the cluster nodes"
  value       = "{{ cluster_previous_image | default(cluster_image) }}"
}
10 changes: 10 additions & 0 deletions ansible/roles/cluster_infra/templates/providers.tf.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Terraform settings for the generated cluster project
terraform {
  # Oldest Terraform release this project accepts
  required_version = ">= 0.14"

  # We need the OpenStack provider; no provider version constraint is set
  required_providers {
    openstack = {
      source = "terraform-provider-openstack/openstack"
    }
  }
}
Loading

0 comments on commit ea3155a

Please sign in to comment.