Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support manila fileshares (cephfs) #344

Merged
merged 22 commits into from
Jan 31, 2024
Merged
Show file tree
Hide file tree
Changes from 21 commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
93edb8d
add support for manila to common environment
sjpb Dec 14, 2023
287d917
use manila share for /scratch in stackhpc env w/ tests
sjpb Dec 14, 2023
ec8e8a5
make home volume optional in skeleton TF
sjpb Jan 10, 2024
254ce87
remove share creation from CI TF
sjpb Jan 10, 2024
93ddc66
add manila UI for caas
sjpb Jan 10, 2024
db1cbdc
support manila- or nfs/volume- based home dirs in caas
sjpb Jan 10, 2024
240378c
remove manila config from UI
sjpb Jan 10, 2024
5338ca6
add optional platform-lifecycle manila share for homedirs for caas
sjpb Jan 10, 2024
9e30b06
add home and project manila config for caas
sjpb Jan 10, 2024
79b3f8d
tweak home volume size UI description to account for shares
sjpb Jan 10, 2024
06daaa4
fix caas manila config typo
sjpb Jan 10, 2024
eb05586
tidy PR diff
sjpb Jan 16, 2024
e659526
bump fatimage to include manila client
sjpb Jan 16, 2024
0da0ab4
Revert commit "tweak home volume size UI description to account for shares"
sjpb Jan 16, 2024
3db521b
add manila UI for caas
sjpb Jan 16, 2024
6b3493c
add defaults for new caas manila extravars, where possible
sjpb Jan 17, 2024
e4c650a
make cluster_home_manila_share_type optional for when default share t…
sjpb Jan 17, 2024
505658b
address review comments
sjpb Jan 18, 2024
2fdb68c
bump manila requirement after role release
sjpb Jan 23, 2024
61252d0
default usage of home manila share to match project share for caas
sjpb Jan 30, 2024
3095ae4
Merge branch 'main' into feat/manila
sjpb Jan 30, 2024
8b75cf9
remove caas manila-specific ui-meta
sjpb Jan 31, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions ansible/fatimage.yml
Original file line number Diff line number Diff line change
Expand Up @@ -41,10 +41,16 @@
tasks_from: client-install.yml
when: "'freeipa_client' in group_names"

# - import_playbook: filesystems.yml
- name: nfs
# - import_playbook: filesystems.yml:
- name: Install nfs packages
dnf:
name: nfs-utils
when: "'nfs' in group_names"
- name: Install Manila client packages
include_role:
name: stackhpc.os-manila-mount
tasks_from: install.yml
when: "'manila' in group_names"

- import_playbook: extras.yml

Expand Down
8 changes: 8 additions & 0 deletions ansible/filesystems.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,11 @@
tasks:
- include_role:
name: stackhpc.nfs

- name: Setup Manila share mounts
hosts: manila
become: true
tags: manila
tasks:
- include_role:
name: stackhpc.os-manila-mount
23 changes: 22 additions & 1 deletion ansible/roles/cluster_infra/templates/resources.tf.j2
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,22 @@ resource "openstack_blockstorage_volume_v3" "state" {
size = "{{ state_volume_size }}"
}

{% if cluster_home_manila_share | bool %}
resource "openstack_sharedfilesystem_share_v2" "home" {
name = "{{ cluster_name }}-home"
description = "Home for cluster"
share_proto = "CEPHFS"
share_type = {{ '"' + cluster_home_manila_share_type + '"' | default('null') }}
size = "{{ home_volume_size }}"
}

resource "openstack_sharedfilesystem_share_access_v2" "home" {
share_id = openstack_sharedfilesystem_share_v2.home.id
access_type = "cephx"
access_to = "cluster_{{ cluster_id }}"
access_level = "rw"
}
{% else %}
resource "openstack_blockstorage_volume_v3" "home" {
name = "{{ cluster_name }}-home"
description = "Home for control node"
Expand All @@ -89,6 +105,7 @@ resource "openstack_blockstorage_volume_v3" "home" {
{% endif %}
{% endif %}
}
{% endif %}

######
###### Cluster network
Expand Down Expand Up @@ -334,13 +351,15 @@ resource "openstack_compute_instance_v2" "control" {
uuid = openstack_blockstorage_volume_v3.state.id
}

{% if not cluster_home_manila_share | bool %}
# home volume:
block_device {
destination_type = "volume"
source_type = "volume"
boot_index = -1
uuid = openstack_blockstorage_volume_v3.home.id
}
{% endif %}

# Use cloud-init to a) inject SSH keys b) configure volumes
user_data = <<-EOF
Expand All @@ -359,12 +378,14 @@ resource "openstack_compute_instance_v2" "control" {
- {{ ssh_key }}
{%- endfor %}
bootcmd:
%{for volume in [openstack_blockstorage_volume_v3.state, openstack_blockstorage_volume_v3.home]}
%{for volume in [openstack_blockstorage_volume_v3.state, {% if not cluster_home_manila_share | bool %} openstack_blockstorage_volume_v3.home {% endif %}]}
- BLKDEV=$(readlink -f $(ls /dev/disk/by-id/*${substr(volume.id, 0, 20)}* | head -n1 )); blkid -o value -s TYPE $BLKDEV || mke2fs -t ext4 -L ${lower(split(" ", volume.description)[0])} $BLKDEV
%{endfor}
mounts:
- [LABEL=state, {{ appliances_state_dir }}, auto]
{% if not cluster_home_manila_share | bool %}
- [LABEL=home, /exports/home, auto]
{% endif %}
EOF
}

Expand Down
4 changes: 4 additions & 0 deletions environments/.caas/inventory/extra_groups
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,7 @@ cluster
[zenith:children]
grafana
openondemand

[manila:children]
login
compute
4 changes: 4 additions & 0 deletions environments/.caas/inventory/group_vars/all/cluster.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,7 @@ openondemand_servername_default: "{{ hostvars[groups['openstack'][0]].cluster_ga
openondemand_servername: "{{ zenith_fqdn_ood | default(openondemand_servername_default) }}"

appliances_state_dir: /var/lib/state

# Defaults for caas-provided extravars:
cluster_project_manila_share: false
sjpb marked this conversation as resolved.
Show resolved Hide resolved
cluster_home_manila_share: "{{ cluster_project_manila_share }}"
16 changes: 16 additions & 0 deletions environments/.caas/inventory/group_vars/all/manila.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
caas_manila_home:
share_name: "{{ cluster_name }}-home"
mount_path: /home
mount_user: root
mount_group: root
mount_mode: u=rwX,go=rX

cluster_project_manila_share_name: azimuth-project-share
caas_manila_project:
share_name: "{{ cluster_project_manila_share_name }}"
mount_path: /project
mount_user: root
mount_group: root
mount_mode: ugo=rwX

os_manila_mount_shares: "{{ ([caas_manila_home] if cluster_home_manila_share | bool else []) + ([caas_manila_project] if cluster_project_manila_share | bool else []) }}"
18 changes: 11 additions & 7 deletions environments/.caas/inventory/group_vars/all/nfs.yml
Original file line number Diff line number Diff line change
@@ -1,16 +1,20 @@
nfs_server: "{{ nfs_server_default }}"

nfs_configurations:
- comment: Export /exports/home from Slurm control node as /home
nfs_enable:
server: "{{ inventory_hostname in groups['control'] }}"
clients: "{{ inventory_hostname in groups['cluster'] and inventory_hostname not in groups['control'] }}"
nfs_export: "/exports/home" # assumes skeleton TF is being used
nfs_client_mnt_point: "/home"
caas_nfs_ood_state:
- comment: Export /var/lib/state from Slurm control node to OOD
nfs_enable:
server: "{{ inventory_hostname in groups['control'] }}"
clients: "{{ inventory_hostname in groups['openondemand'] }}"
nfs_export: "{{ appliances_state_dir }}"
nfs_client_mnt_point: "{{ appliances_state_dir }}"
nfs_client_mnt_options: "x-systemd.required-by=zenith-ood.service,x-systemd.before=zenith-ood.service"

caas_nfs_home:
- comment: Export /exports/home from Slurm control node as /home
nfs_enable:
server: "{{ inventory_hostname in groups['control'] }}"
clients: "{{ inventory_hostname in groups['cluster'] and inventory_hostname not in groups['control'] }}"
nfs_export: "/exports/home" # assumes skeleton TF is being used
nfs_client_mnt_point: "/home"

nfs_configurations: "{{ caas_nfs_ood_state + (caas_nfs_home if not cluster_home_manila_share | bool else []) }}"
105 changes: 105 additions & 0 deletions environments/.caas/ui-meta/slurm-infra-manila.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
name: "slurm-manila-preview"
label: "Slurm: Manila Integration Preview"
description: >-
Batch cluster running the Slurm workload manager, the Open
OnDemand web interface, and custom monitoring.

Has /project and /home from CephFS/Manila.
logo: https://upload.wikimedia.org/wikipedia/commons/thumb/3/3a/Slurm_logo.svg/158px-Slurm_logo.svg.png

parameters:
- name: cluster_floating_ip
label: External IP
description: The external IP to use for the login node.
kind: cloud.ip
immutable: true

- name: compute_count
label: Compute node count
description: The number of compute nodes in the cluster.
kind: integer
options:
min: 1
default: 3

- name: compute_flavor
label: Compute node size
description: The size to use for the compute node.
kind: "cloud.size"
immutable: true
options:
min_ram: 2048
min_disk: 20

- name: home_volume_size
label: Home filesystem size (GB)
description: The size of the cloud volume or share to use for home directories
kind: integer
sjpb marked this conversation as resolved.
Show resolved Hide resolved
immutable: true
options:
min: 10
default: 100

- name: metrics_db_maximum_size
label: Metrics database size (GB)
description: |
The oldest metrics records in the [Prometheus](https://prometheus.io/) database will be
discarded to ensure that the database does not grow larger than this size.

**A cloud volume of this size +10GB will be created to hold and persist the metrics
database and important Slurm files.**
kind: integer
immutable: true
options:
min: 10
default: 10

- name: cluster_run_validation
label: Post-configuration validation
description: >-
If selected, post-configuration jobs will be executed to validate the core functionality
of the cluster when it is re-configured.
kind: boolean
required: false
default: true
options:
checkboxLabel: Run post-configuration validation?
sjpb marked this conversation as resolved.
Show resolved Hide resolved

usage_template: |-
# Accessing the cluster using Open OnDemand

[Open OnDemand](https://openondemand.org/) is a web portal for managing HPC jobs, including graphical
environments such as [Jupyter Notebooks](https://jupyter.org/).

{% if cluster.outputs.openondemand_url %}
The Open OnDemand portal for this cluster is available at
[{{ cluster.outputs.openondemand_url.slice(8) }}]({{ cluster.outputs.openondemand_url }}).

Enter the username `azimuth` and password `{{ cluster.outputs.azimuth_user_password }}` when prompted.
{% else %}
The Open OnDemand portal for this cluster can be accessed from the services list.
{% endif %}

# Accessing the cluster using SSH

The cluster can be accessed over SSH via the external IP. The SSH public key of the user that
deployed the cluster is injected into the `azimuth` user:

```
$ ssh azimuth@{{ cluster.outputs.cluster_access_ip | default('[cluster ip]') }}
[azimuth@{{ cluster.name }}-login-0 ~]$ sinfo
PARTITION AVAIL TIMELIMIT NODES STATE NODELIST
compute* up 60-00:00:0 {{ "%3s" | format(cluster.parameter_values.compute_count) }} idle {{ cluster.name }}-compute-[0-{{ cluster.parameter_values.compute_count - 1 }}]
```

The `rocky` user can be accessed the same way and has passwordless `sudo` enabled.

SSH access can be granted to additional users by placing their SSH public key in `~azimuth/.ssh/authorized_keys`.

services:
- name: ood
label: Open OnDemand
icon_url: https://github.com/stackhpc/ansible-slurm-appliance/raw/main/environments/.caas/assets/ood-icon.png
- name: monitoring
label: Monitoring
icon_url: https://raw.githubusercontent.com/cncf/artwork/master/projects/prometheus/icon/color/prometheus-icon-color.png
5 changes: 5 additions & 0 deletions environments/.stackhpc/inventory/extra_groups
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,8 @@ cluster
# [resolv_conf:children]
# freeipa_client
# --- end of FreeIPA example ---

[manila:children]
# Allows demo; also installs manila client in fat image
login
compute
11 changes: 10 additions & 1 deletion environments/.stackhpc/terraform/main.tf
Original file line number Diff line number Diff line change
@@ -1,5 +1,14 @@
# This terraform configuration uses the "skeleton" terraform, so that is checked by CI.

terraform {
required_version = ">= 0.14"
required_providers {
openstack = {
source = "terraform-provider-openstack/openstack"
}
}
}

variable "environment_root" {
type = string
description = "Path to environment root, automatically set by activate script"
Expand All @@ -13,7 +22,7 @@ variable "cluster_name" {
variable "cluster_image" {
description = "single image for all cluster nodes - a convenience for CI"
type = string
default = "openhpc-240116-1156-aa8dba7d" # https://github.com/stackhpc/ansible-slurm-appliance/pull/351
default = "openhpc-240116-1604-b3563a08" # https://github.com/stackhpc/ansible-slurm-appliance/pull/344
# default = "Rocky-8-GenericCloud-Base-8.9-20231119.0.x86_64.qcow2"
}

Expand Down
13 changes: 13 additions & 0 deletions environments/common/inventory/group_vars/all/manila.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Default configuration for manila file shares, see
# https://github.com/stackhpc/ansible-role-os-manila-mount
# for all variable definitions, and override in your environment.

os_manila_mount_shares: []
# - share_name:
# share_user:
# mount_path:
# mount_user:
# mount_group:
# mount_mode:

# os_manila_mount_ceph_version: nautilus # role default for RockyLinux 8
3 changes: 3 additions & 0 deletions environments/common/layouts/everything
Original file line number Diff line number Diff line change
Expand Up @@ -66,3 +66,6 @@ openhpc

[proxy]
# Hosts to configure http/s proxies - see ansible/roles/proxy/README.md

[manila]
# Hosts to configure for manila fileshares
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
locals {
user_data_path = "${var.environment_root}/cloud_init/${var.cluster_name}-%s.userdata.yml"
control_volumes = concat([openstack_blockstorage_volume_v3.state], var.home_volume_size > 0 ? [openstack_blockstorage_volume_v3.home][0] : [])
}


Expand Down Expand Up @@ -100,20 +101,14 @@ resource "openstack_compute_instance_v2" "control" {
delete_on_termination = true
}

# state volume:
block_device {
destination_type = "volume"
source_type = "volume"
boot_index = -1
uuid = openstack_blockstorage_volume_v3.state.id
}

# home volume:
block_device {
dynamic "block_device" {
for_each = local.control_volumes
content {
destination_type = "volume"
source_type = "volume"
boot_index = -1
uuid = openstack_blockstorage_volume_v3.home.id
uuid = block_device.value.id # actually openstack_blockstorage_volume_v3 id
}
}

network {
Expand All @@ -130,13 +125,15 @@ resource "openstack_compute_instance_v2" "control" {
fqdn: ${var.cluster_name}-${each.key}.${var.cluster_name}.${var.cluster_domain_suffix}

bootcmd:
%{for volume in [openstack_blockstorage_volume_v3.state, openstack_blockstorage_volume_v3.home]}
%{for volume in local.control_volumes}
- BLKDEV=$(readlink -f $(ls /dev/disk/by-id/*${substr(volume.id, 0, 20)}* | head -n1 )); blkid -o value -s TYPE $BLKDEV || mke2fs -t ext4 -L ${lower(split(" ", volume.description)[0])} $BLKDEV
%{endfor}

mounts:
- [LABEL=state, ${var.state_dir}]
%{if var.home_volume_size > 0}
- [LABEL=home, /exports/home]
%{endif}
EOF

}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ variable "state_volume_size" {
variable "home_volume_size" {
type = number
description = "Size of state volume on control node, in GB"
default = 100 # GB
default = 100 # GB, 0 means no home volume
}

variable "vnic_type" {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@ resource "openstack_blockstorage_volume_v3" "state" {
}

resource "openstack_blockstorage_volume_v3" "home" {

count = var.home_volume_size > 0 ? 1 : 0

name = "${var.cluster_name}-home"
description = "Home for control node" # first word used to label filesystem
size = var.home_volume_size
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
ansible==6.0.0
openstacksdk
python-openstackclient
python-manilaclient
jmespath
passlib[bcrypt]==1.7.4
cookiecutter
Expand Down
Loading
Loading