From 9cf07d3199ef386c60afcf6af447fab35dc5dca0 Mon Sep 17 00:00:00 2001
From: Marian Krcmarik
Date: Mon, 9 Dec 2024 13:16:21 +0100
Subject: [PATCH] DNM ci_dcn_site: Add scaling down of a DCN site

The ci_dcn_site role now has two modes, selected by `_az_to_scaledown`:

* empty (the default): deploy a new DCN site. The former steps of
  tasks/main.yml move unchanged into tasks/deploy_site.yaml.
* set to an AZ name: run tasks/scaledown_site.yaml for that AZ and render
  service-values.yaml.j2 with that AZ's backends omitted and its cell
  scaled to 0 replicas.

In openshift_adm, the pending certificate approval check moves to the end
of wait_for_cluster.yml, after the task that waits on `_oc_login_result`.
---
 roles/ci_dcn_site/README.md                   |   1 +
 roles/ci_dcn_site/defaults/main.yml           |   1 +
 roles/ci_dcn_site/tasks/deploy_site.yaml      |  34 +++
 roles/ci_dcn_site/tasks/main.yml              |  23 +-
 roles/ci_dcn_site/tasks/scaledown_site.yaml   | 268 ++++++++++++++++++
 .../templates/service-values.yaml.j2          |  24 +-
 .../openshift_adm/tasks/wait_for_cluster.yml  |  12 +-
 7 files changed, 339 insertions(+), 24 deletions(-)
 create mode 100644 roles/ci_dcn_site/tasks/deploy_site.yaml
 create mode 100644 roles/ci_dcn_site/tasks/scaledown_site.yaml

diff --git a/roles/ci_dcn_site/README.md b/roles/ci_dcn_site/README.md
index 562474432c..a6ac97c9a3 100644
--- a/roles/ci_dcn_site/README.md
+++ b/roles/ci_dcn_site/README.md
@@ -12,6 +12,7 @@ with a collocated Ceph cluster.
 ## Parameters
 
 * `_az`: The name of the availability zone for the AZ, e.g. `az1`
+* `_az_to_scaledown`: The name of an already deployed availability zone to scale down, e.g. `az1`. When empty (the default), a new DCN site is deployed instead.
 * `_group_name`: The name of the group of nodes to be deployed, e.g. `dcn1-computes`
 * `_subnet`: The name of the subnet the DCN site will use, e.g. `subnet2`
 * `_subnet_network_range`: The range of the subnet the DCN site will use, e.g. `192.168.133.0/24`
diff --git a/roles/ci_dcn_site/defaults/main.yml b/roles/ci_dcn_site/defaults/main.yml
index ea30f552b7..857c256f9a 100644
--- a/roles/ci_dcn_site/defaults/main.yml
+++ b/roles/ci_dcn_site/defaults/main.yml
@@ -26,3 +26,4 @@ ci_dcn_site_search_storagemgmt_network_names:
   - "storagemgmtdcn1"
   - "storagemgmtdcn2"
 cifmw_ci_dcn_site_enable_network_az: false
+_az_to_scaledown: ""
diff --git a/roles/ci_dcn_site/tasks/deploy_site.yaml b/roles/ci_dcn_site/tasks/deploy_site.yaml
new file mode 100644
index 0000000000..994057febb
--- /dev/null
+++ b/roles/ci_dcn_site/tasks/deploy_site.yaml
@@ -0,0 +1,34 @@
+---
+# Copyright Red Hat, Inc.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+- name: Render and apply pre-ceph CRs in DCN context
+  ansible.builtin.include_tasks: pre-ceph.yml
+
+- name: Deploy Ceph in DCN context
+  ansible.builtin.include_tasks: ceph.yml
+
+- name: Render and apply post-ceph CRs in DCN context
+  ansible.builtin.include_tasks: post-ceph.yml
+
+- name: Run Nova cell discovery for new DCN hosts
+  kubernetes.core.k8s_exec:
+    api_key: "{{ _auth_results.openshift_auth.api_key }}"
+    namespace: openstack
+    pod: nova-cell0-conductor-0
+    command: nova-manage cell_v2 discover_hosts --verbose
+
+- name: Create new AZ and add new hosts to it
+  ansible.builtin.include_tasks: az.yml
diff --git a/roles/ci_dcn_site/tasks/main.yml b/roles/ci_dcn_site/tasks/main.yml
index d449ba9838..a217678efc 100644
--- a/roles/ci_dcn_site/tasks/main.yml
+++ b/roles/ci_dcn_site/tasks/main.yml
@@ -22,21 +22,10 @@
 - name: Set Network related facts
   ansible.builtin.include_tasks: set_network_facts.yml
 
-- name: Render and apply pre-ceph CRs in DCN context
-  ansible.builtin.include_tasks: pre-ceph.yml
+- name: Deploy a DCN site
+  ansible.builtin.include_tasks: deploy_site.yaml
+  when: _az_to_scaledown | default("") == ""
 
-- name: Deploy Ceph in DCN context
-  ansible.builtin.include_tasks: ceph.yml
-
-- name: Render and apply post-ceph CRs in DCN context
-  ansible.builtin.include_tasks: post-ceph.yml
-
-- name: Run Nova cell discovery for new DCN hosts
-  kubernetes.core.k8s_exec:
-    api_key: "{{ _auth_results.openshift_auth.api_key }}"
-    namespace: openstack
-    pod: nova-cell0-conductor-0
-    command: nova-manage cell_v2 discover_hosts --verbose
-
-- name: Create new AZ and add new hosts to it
-  ansible.builtin.include_tasks: az.yml
+- name: Scale a DCN site down
+  ansible.builtin.include_tasks: scaledown_site.yaml
+  when: _az_to_scaledown | default("") != ""
diff --git a/roles/ci_dcn_site/tasks/scaledown_site.yaml b/roles/ci_dcn_site/tasks/scaledown_site.yaml
new file mode 100644
index 0000000000..9bba31876e
--- /dev/null
+++ b/roles/ci_dcn_site/tasks/scaledown_site.yaml
@@ -0,0 +1,268 @@
+---
+# Copyright Red Hat, Inc.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+# Best effort: if this lookup fails, the aggregate clean-up tasks are skipped.
+- name: Get compute nodes from the host aggregate
+  register: az_hosts
+  ignore_errors: true
+  kubernetes.core.k8s_exec:
+    api_key: "{{ _auth_results.openshift_auth.api_key }}"
+    namespace: openstack
+    pod: openstackclient
+    command: >-
+      openstack aggregate show {{ _az_to_scaledown }} -c hosts -f value
+
+- name: Convert az_hosts string to list and remove extra text
+  ansible.builtin.set_fact:
+    az_hosts_list: "{{ az_hosts.stdout | default([]) | from_yaml | list }}"
+  when: not az_hosts.failed
+
+# loop is templated before when, so fall back to [] if the lookup failed.
+- name: Delete the compute nodes from the aggregate
+  loop: "{{ az_hosts_list | default([]) }}"
+  kubernetes.core.k8s_exec:
+    api_key: "{{ _auth_results.openshift_auth.api_key }}"
+    namespace: openstack
+    pod: openstackclient
+    command: >-
+      openstack aggregate remove host {{ _az_to_scaledown }} {{ item }}
+  when: not az_hosts.failed
+
+- name: Delete the host aggregate
+  kubernetes.core.k8s_exec:
+    api_key: "{{ _auth_results.openshift_auth.api_key }}"
+    namespace: openstack
+    pod: openstackclient
+    command: >-
+      openstack aggregate delete {{ _az_to_scaledown }}
+  when: not az_hosts.failed
+
+- name: Find all ceph variable files
+  register: _ceph_vars_files
+  ansible.builtin.find:
+    paths: "/tmp"
+    patterns: "ceph_client_az*.yml"
+    recurse: false
+
+- name: Load all ceph vars from files
+  loop: "{{ _ceph_vars_files.files | map(attribute='path') | list }}"
+  register: _ceph_vars
+  ansible.builtin.include_vars:
+    file: "{{ item }}"
+
+- name: Combine ceph variables into a list of dictionaries
+  loop: "{{ _ceph_vars.results }}"
+  ansible.builtin.set_fact:
+    _ceph_vars_list: "{{ _ceph_vars_list | default([]) | union([item.ansible_facts]) }}"
+
+- name: Define _all_azs list for all Ceph backends
+  loop: "{{ _ceph_vars_list }}"
+  ansible.builtin.set_fact:
+    _all_azs: "{{ _all_azs | default([]) + [ item.cifmw_ceph_client_cluster ] }}"
+
+- name: The map for az0 contains all AZ backends
+  ansible.builtin.set_fact:
+    ci_dcn_site_glance_map: "{{ { 'az0': _all_azs } }}"
+
+- name: The map for AZs other than az0 contains backends for az0 and itself
+  loop: "{{ _all_azs }}"
+  when: item != "az0"
+  ansible.builtin.set_fact:
+    ci_dcn_site_glance_map: "{{ ci_dcn_site_glance_map | combine( { item: ['az0', item ] } ) }}"
+
+- name: Render the scale-downed control plane service-values.yaml
+  ansible.builtin.template:
+    mode: "0644"
+    backup: true
+    src: "templates/service-values.yaml.j2"
+    dest: "{{ ci_dcn_site_arch_path }}/control-plane/scaledown/service-values.yaml"
+
+- name: Kustomize scale-downed OpenStackControlPlane
+  ansible.builtin.set_fact:
+    scaledown_controlplane_cr: >-
+      {{ lookup('kubernetes.core.kustomize',
+      dir=ci_dcn_site_arch_path + '/control-plane/scaledown') }}
+
+- name: Save the scale-downed OpenStackControlPlane CR
+  ansible.builtin.copy:
+    mode: "0644"
+    dest: "{{ ci_dcn_site_arch_path }}/control-plane-scale-downed_{{ _az_to_scaledown }}.yaml"
+    content: "{{ scaledown_controlplane_cr }}"
+    backup: true
+
+- name: Apply the scale-downed OpenStackControlPlane CR
+  register: result
+  retries: 5
+  delay: 10
+  until: result is not failed
+  kubernetes.core.k8s:
+    api_key: "{{ _auth_results.openshift_auth.api_key }}"
+    state: present
+    apply: true
+    src: "{{ ci_dcn_site_arch_path }}/control-plane-scale-downed_{{ _az_to_scaledown }}.yaml"
+
+- name: Delete rabbitmqcluster
+  vars:
+    az_to_cell_map:
+      az0: cell1
+      az1: cell2
+      az2: cell3
+  ansible.builtin.command:
+    cmd: oc delete -n openstack rabbitmqclusters rabbitmq-{{ az_to_cell_map[_az_to_scaledown] }}
+
+- name: Delete the cinder-volume service
+  kubernetes.core.k8s_exec:
+    api_key: "{{ _auth_results.openshift_auth.api_key }}"
+    namespace: openstack
+    pod: cinder-scheduler-0
+    command: >-
+      cinder-manage service remove cinder-volume cinder-volume-{{ _az_to_scaledown }}-0@ceph
+
+- name: Fetch ceph-conf-files secret
+  kubernetes.core.k8s_info:
+    api_key: "{{ _auth_results.openshift_auth.api_key }}"
+    kind: Secret
+    name: ceph-conf-files
+    namespace: openstack
+  register: secret_info
+
+- name: Save secret data to files
+  ansible.builtin.copy:
+    content: "{{ secret_info.resources[0].data[key] | b64decode }}"
+    dest: "/tmp/{{ key }}"
+  loop: "{{ secret_info.resources[0].data.keys() }}"
+  loop_control:
+    loop_var: key
+
+# Secrets are addressed by their bare name, not by a /tmp path.
+- name: Delete the Ceph cluster's secrets of removed cluster and default site cluster
+  kubernetes.core.k8s:
+    api_key: "{{ _auth_results.openshift_auth.api_key }}"
+    kind: Secret
+    name: "{{ item }}"
+    namespace: openstack
+    state: absent
+  loop:
+    - "ceph-conf-files-{{ _az_to_scaledown }}"
+    - "ceph-conf-files"
+
+- name: Find all ceph conf files saved from the secret
+  register: all_ceph_conf_files
+  ansible.builtin.find:
+    paths: "/tmp"
+    patterns: "az*.c*"
+    recurse: false
+
+- name: Set fact for base64-encoded file data of ceph-conf-files Secret
+  ansible.builtin.set_fact:
+    file_data: "{{ file_data | default({}) | combine({ item | basename: (lookup('file', item) | b64encode) }) }}"
+  loop: "{{ all_ceph_conf_files.files | map(attribute='path') | reject('search', _az_to_scaledown) | list }}"
+
+- name: Recreate the secret while omitting deleted ceph cluster
+  kubernetes.core.k8s:
+    api_key: "{{ _auth_results.openshift_auth.api_key }}"
+    state: present
+    definition:
+      kind: Secret
+      metadata:
+        name: ceph-conf-files
+        namespace: openstack
+      type: Opaque
+      data: "{{ file_data }}"
+
+- name: Get compute nodes from the scale-downed AZ
+  register: az_compute_hosts
+  kubernetes.core.k8s_exec:
+    api_key: "{{ _auth_results.openshift_auth.api_key }}"
+    namespace: openstack
+    pod: openstackclient
+    command: >-
+      sh -c "openstack compute service list -c Host -c Zone -f value | grep {{ _az_to_scaledown }} | awk '{print $1}'"
+
+- name: Disable the compute service on scale-downed compute nodes
+  loop: "{{ az_compute_hosts.stdout_lines }}"
+  kubernetes.core.k8s_exec:
+    api_key: "{{ _auth_results.openshift_auth.api_key }}"
+    namespace: openstack
+    pod: openstackclient
+    command: >-
+      openstack compute service set {{ item }} nova-compute --disable
+
+- name: Stop the ovn_controller service
+  ansible.builtin.service:
+    name: edpm_ovn_controller
+    state: stopped
+  become: true
+  delegate_to: "{{ item }}"
+  with_items: "{{ groups[_group_name] }}"
+
+- name: Stop the ovn metadata agent service
+  ansible.builtin.service:
+    name: edpm_ovn_metadata_agent
+    state: stopped
+  become: true
+  delegate_to: "{{ item }}"
+  with_items: "{{ groups[_group_name] }}"
+
+- name: Stop the nova-compute service
+  ansible.builtin.service:
+    name: edpm_nova_compute
+    state: stopped
+  become: true
+  delegate_to: "{{ item }}"
+  with_items: "{{ groups[_group_name] }}"
+
+# systemd unit files carry the .service suffix; pair every host with every unit.
+- name: Remove the systemd unit files of the ovn and nova-compute containers
+  ansible.builtin.file:
+    path: "/etc/systemd/system/{{ item.1 }}.service"
+    state: absent
+  become: true
+  delegate_to: "{{ item.0 }}"
+  loop: "{{ groups[_group_name] | product(['edpm_ovn_controller', 'edpm_ovn_metadata_agent', 'edpm_nova_compute']) | list }}"
+
+# xargs -r skips the delete when a host has no agents listed.
+- name: Delete the network agents on scale-downed compute nodes
+  loop: "{{ az_compute_hosts.stdout_lines }}"
+  kubernetes.core.k8s_exec:
+    api_key: "{{ _auth_results.openshift_auth.api_key }}"
+    namespace: openstack
+    pod: openstackclient
+    command: >-
+      sh -c "openstack network agent list --host {{ item }} -c ID -f value | xargs -r openstack network agent delete"
+
+- name: Fetch OpenStackDataPlaneNodeSet resource
+  kubernetes.core.k8s_info:
+    api_key: "{{ _auth_results.openshift_auth.api_key }}"
+    api_version: dataplane.openstack.org/v1beta1
+    kind: OpenStackDataPlaneNodeSet
+    name: "{{ _group_name }}-edpm"
+    namespace: openstack
+  register: osdpns_info
+
+- name: Delete OpenStackDataPlaneNodeSet
+  kubernetes.core.k8s:
+    api_key: "{{ _auth_results.openshift_auth.api_key }}"
+    api_version: dataplane.openstack.org/v1beta1
+    state: absent
+    kind: OpenStackDataPlaneNodeSet
+    name: "{{ _group_name }}-edpm"
+    namespace: openstack
+
+- name: Delete each Secret which contains TLS certificate for the NodeSet nodes
+  ansible.builtin.command:
+    cmd: oc delete -n openstack Secret {{ item }}
+  loop: "{{ osdpns_info.resources[0].status.secretHashes.keys() | select('search', 'cert') | list }}"
diff --git a/roles/ci_dcn_site/templates/service-values.yaml.j2 b/roles/ci_dcn_site/templates/service-values.yaml.j2
index 4c4d684c37..2147beb6c6 100644
--- a/roles/ci_dcn_site/templates/service-values.yaml.j2
+++ b/roles/ci_dcn_site/templates/service-values.yaml.j2
@@ -26,6 +26,7 @@ data:
       storage_availability_zone = az0
   cinderVolumes:
 {% for _ceph in _ceph_vars_list %}
+{% if _ceph.cifmw_ceph_client_cluster != _az_to_scaledown %}
     {{ _ceph.cifmw_ceph_client_cluster }}:
       customServiceConfig: |
         [DEFAULT]
@@ -41,6 +42,7 @@ data:
         rbd_secret_uuid = {{ _ceph.cifmw_ceph_client_fsid }}
         rbd_cluster_name = {{ _ceph.cifmw_ceph_client_cluster }}
         backend_availability_zone = {{ _ceph.cifmw_ceph_client_cluster }}
+{% endif %}
 {% endfor %}
   galera:
     templates:
@@ -50,7 +52,11 @@ data:
         storageRequest: 5G
 {% for index in range(1, _all_azs | length + 1) %}
       openstack-cell{{ index }}:
+{% if "az" ~ (index - 1) != _az_to_scaledown %}
         replicas: 1
+{% else %}
+        replicas: 0
+{% endif %}
         secret: osp-secret
         storageRequest: 5G
 {% endfor %}
@@ -58,6 +64,7 @@ data:
     keystoneEndpoint: az0
     glanceAPIs:
 {% for _ceph in _ceph_vars_list %}
+{% if _ceph.cifmw_ceph_client_cluster != _az_to_scaledown %}
       {{ _ceph.cifmw_ceph_client_cluster }}:
         customServiceConfig: |
           [DEFAULT]
@@ -66,12 +73,14 @@ data:
           [glance_store]
           default_backend = {{ _ceph.cifmw_ceph_client_cluster }}
 {% for _ceph_az in ci_dcn_site_glance_map[_ceph.cifmw_ceph_client_cluster] %}
+{% if _ceph_az != _az_to_scaledown %}
           [{{ _ceph_az }}]
           rbd_store_ceph_conf = /etc/ceph/{{ _ceph_az }}.conf
           store_description = "{{ _ceph_az }} RBD backend"
           rbd_store_pool = images
           rbd_store_user = openstack
           rbd_thin_provisioning = True
+{% endif %}
 {% endfor %}
         networkAttachments:
         - storage
@@ -92,6 +101,7 @@ data:
         replicas: 1
         type: edge
 {% endif %}
+{% endif %}
 {% endfor %}
   manila:
     enabled: false
@@ -157,7 +167,11 @@ data:
         cellDatabaseAccount: nova-cell{{ index }}
         cellMessageBusInstance: rabbitmq-cell{{ index }}
         conductorServiceTemplate:
+{% if "az" ~ (index - 1) != _az_to_scaledown %}
           replicas: 1
+{% else %}
+          replicas: 0
+{% endif %}
         hasAPIAccess: true
         metadataServiceTemplate:
           enabled: true
@@ -170,7 +184,11 @@ data:
                   metallb.universe.tf/loadBalancerIPs: 172.17.0.8{{ index }}
               spec:
                 type: LoadBalancer
+{% if "az" ~ (index - 1) != _az_to_scaledown %}
           replicas: 3
+{% else %}
+          replicas: 0
+{% endif %}
 {% endfor %}
   rabbitmq:
     templates:
@@ -194,7 +212,11 @@ data:
                 metallb.universe.tf/loadBalancerIPs: 172.17.0.8{{ 5 + index }}
             spec:
               type: LoadBalancer
+{% if "az" ~ (index - 1) != _az_to_scaledown %}
         replicas: 3
+{% else %}
+        replicas: 0
+{% endif %}
 {% endfor %}
   extraMounts:
     - name: v1
@@ -215,7 +237,7 @@ data:
               mountPath: /etc/ceph
               readOnly: true
 {% for _ceph in _ceph_vars_list %}
-{% if _ceph.cifmw_ceph_client_cluster != 'az0' %}
+{% if _ceph.cifmw_ceph_client_cluster != 'az0' and _ceph.cifmw_ceph_client_cluster != _az_to_scaledown %}
         - propagation:
             - {{ _ceph.cifmw_ceph_client_cluster }}
           extraVolType: Ceph
diff --git a/roles/openshift_adm/tasks/wait_for_cluster.yml b/roles/openshift_adm/tasks/wait_for_cluster.yml
index 5d3c92be28..4a780bdbd3 100644
--- a/roles/openshift_adm/tasks/wait_for_cluster.yml
+++ b/roles/openshift_adm/tasks/wait_for_cluster.yml
@@ -46,12 +46,6 @@
   retries: "{{ cifmw_openshift_adm_retry_count }}"
   delay: 5
 
-- name: Check for pending certificate approval.
-  when:
-    - _openshift_adm_check_cert_approve | default(false) | bool
-  approve_csr:
-    k8s_config: "{{ cifmw_openshift_kubeconfig }}"
-
 - name: Wait until the OpenShift cluster is stable.
   environment:
     KUBECONFIG: "{{ cifmw_openshift_kubeconfig }}"
@@ -71,3 +65,9 @@
   until: _oc_login_result.k8s_auth is defined
   retries: "{{ cifmw_openshift_adm_retry_count }}"
   delay: 2
+
+- name: Check for pending certificate approval.
+  when:
+    - _openshift_adm_check_cert_approve | default(false) | bool
+  approve_csr:
+    k8s_config: "{{ cifmw_openshift_kubeconfig }}"