From 259017af88372d74606d0574fd34c55cfaf3ba67 Mon Sep 17 00:00:00 2001 From: David Hageman Date: Sat, 2 Mar 2024 15:54:42 -0600 Subject: [PATCH] Add -ness checks and refactor migrations --- config/crd/bases/awx.ansible.com_awxs.yaml | 80 +++++++++++++++++++ config/rbac/role.yaml | 11 +++ .../container-probes.md | 40 ++++++++++ roles/installer/tasks/initialize_django.yml | 32 ++++---- roles/installer/tasks/install.yml | 46 +---------- roles/installer/tasks/migrate_schema.yml | 57 +++++++++++++ .../tasks/resources_configuration.yml | 26 +++--- roles/installer/tasks/update_status.yml | 8 +- .../templates/deployments/task.yaml.j2 | 47 ++++++++++- .../templates/deployments/web.yaml.j2 | 25 ++++++ .../templates/jobs/migration.yaml.j2 | 57 +++++++++++++ 11 files changed, 352 insertions(+), 77 deletions(-) create mode 100644 docs/user-guide/advanced-configuration/container-probes.md create mode 100644 roles/installer/tasks/migrate_schema.yml create mode 100644 roles/installer/templates/jobs/migration.yaml.j2 diff --git a/config/crd/bases/awx.ansible.com_awxs.yaml b/config/crd/bases/awx.ansible.com_awxs.yaml index 55628abfe..66306aec7 100644 --- a/config/crd/bases/awx.ansible.com_awxs.yaml +++ b/config/crd/bases/awx.ansible.com_awxs.yaml @@ -1571,6 +1571,86 @@ spec: description: Number of task instance replicas type: integer format: int32 + web_liveness_initial_delay: + description: Initial delay before starting liveness checks on web pod + type: integer + default: 5 + format: int32 + task_liveness_initial_delay: + description: Initial delay before starting liveness checks on task pod + type: integer + default: 5 + format: int32 + web_liveness_period: + description: Time period in seconds between each liveness check for the web pod + type: integer + default: 0 + format: int32 + task_liveness_period: + description: Time period in seconds between each liveness check for the task pod + type: integer + default: 0 + format: int32 + web_liveness_failure_threshold: + description: 
Number of consecutive failure events to identify failure of web pod + type: integer + default: 3 + format: int32 + task_liveness_failure_threshold: + description: Number of consecutive failure events to identify failure of task pod + type: integer + default: 3 + format: int32 + web_liveness_timeout: + description: Number of seconds to wait for a probe response from web pod + type: integer + default: 1 + format: int32 + task_liveness_timeout: + description: Number of seconds to wait for a probe response from task pod + type: integer + default: 1 + format: int32 + web_readiness_initial_delay: + description: Initial delay before starting readiness checks on web pod + type: integer + default: 20 + format: int32 + task_readiness_initial_delay: + description: Initial delay before starting readiness checks on task pod + type: integer + default: 20 + format: int32 + web_readiness_period: + description: Time period in seconds between each readiness check for the web pod + type: integer + default: 0 + format: int32 + task_readiness_period: + description: Time period in seconds between each readiness check for the task pod + type: integer + default: 0 + format: int32 + web_readiness_failure_threshold: + description: Number of consecutive failure events to identify failure of web pod + type: integer + default: 3 + format: int32 + task_readiness_failure_threshold: + description: Number of consecutive failure events to identify failure of task pod + type: integer + default: 3 + format: int32 + web_readiness_timeout: + description: Number of seconds to wait for a probe response from web pod + type: integer + default: 1 + format: int32 + task_readiness_timeout: + description: Number of seconds to wait for a probe response from task pod + type: integer + default: 1 + format: int32 garbage_collect_secrets: description: Whether or not to remove secrets upon instance removal default: false diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml index 9d2af0ce2..d94943718 100644 --- 
a/config/rbac/role.yaml +++ b/config/rbac/role.yaml @@ -78,6 +78,17 @@ rules: - patch - update - watch + - apiGroups: + - batch + resources: + - jobs + verbs: + - get + - list + - create + - patch + - update + - watch - apiGroups: - monitoring.coreos.com resources: diff --git a/docs/user-guide/advanced-configuration/container-probes.md b/docs/user-guide/advanced-configuration/container-probes.md new file mode 100644 index 000000000..338e53773 --- /dev/null +++ b/docs/user-guide/advanced-configuration/container-probes.md @@ -0,0 +1,40 @@ +#### Container Probes +These parameters control the usage of liveness and readiness container probes for the web and task containers. + +#### Web / Task Container Liveness Check + +The liveness probe queries the status of the supervisor daemon of the container. The probe will fail if it +detects one of the services in a state other than "RUNNING". + +| Name | Description | Default | +| -------------| ---------------------------------- | ------- | +| web_liveness_period | Time period in seconds between each probe check. The value of 0 disables the probe. | 0 | +| web_liveness_initial_delay | Initial delay before starting probes in seconds | 5 | +| web_liveness_failure_threshold| Number of consecutive failure events to identify failure of container | 3 | +| web_liveness_timeout | Number of seconds to wait for a probe response from container | 1 | +| task_liveness_period | Time period in seconds between each probe check. The value of 0 disables the probe. | 0 | +| task_liveness_initial_delay | Initial delay before starting probes in seconds | 5 | +| task_liveness_failure_threshold| Number of consecutive failure events to identify failure of container | 3 | +| task_liveness_timeout | Number of seconds to wait for a probe response from container | 1 | + +#### Web Container Readiness Check + +This is an HTTP check against the status endpoint to confirm the system is still able to respond to web requests.
+ +| Name | Description | Default | +| -------------| ---------------------------------- | ------- | +| web_readiness_period | Time period in seconds between each probe check. The value of 0 disables the probe. | 0 | +| web_readiness_initial_delay | Initial delay before starting probes in seconds | 20 | +| web_readiness_failure_threshold| Number of consecutive failure events to identify failure of container | 3 | +| web_readiness_timeout | Number of seconds to wait for a probe response from container | 1 | + +#### Task Container Readiness Check + +This is a command probe using the builtin check command of the awx-manage utility. + +| Name | Description | Default | +| -------------| ---------------------------------- | ------- | +| task_readiness_period | Time period in seconds between each probe check. The value of 0 disables the probe. | 0 | +| task_readiness_initial_delay | Initial delay before starting probes in seconds | 20 | +| task_readiness_failure_threshold| Number of consecutive failure events to identify failure of container | 3 | +| task_readiness_timeout | Number of seconds to wait for a probe response from container | 1 | diff --git a/roles/installer/tasks/initialize_django.yml b/roles/installer/tasks/initialize_django.yml index 158a77573..3dad2cb4c 100644 --- a/roles/installer/tasks/initialize_django.yml +++ b/roles/installer/tasks/initialize_django.yml @@ -2,8 +2,8 @@ - name: Check if there are any super users defined. k8s_exec: namespace: "{{ ansible_operator_meta.namespace }}" - pod: "{{ awx_task_pod_name }}" - container: "{{ ansible_operator_meta.name }}-task" + pod: "{{ awx_web_pod_name }}" + container: "{{ ansible_operator_meta.name }}-web" command: >- bash -c "echo 'from django.contrib.auth.models import User; nsu = User.objects.filter(is_superuser=True, username=\"{{ admin_user }}\").count(); @@ -16,8 +16,8 @@ - name: Create super user via Django if it doesn't exist.
k8s_exec: namespace: "{{ ansible_operator_meta.namespace }}" - pod: "{{ awx_task_pod_name }}" - container: "{{ ansible_operator_meta.name }}-task" + pod: "{{ awx_web_pod_name }}" + container: "{{ ansible_operator_meta.name }}-web" command: awx-manage createsuperuser --username={{ admin_user | quote }} --email={{ admin_email | quote }} --noinput register: result changed_when: "'That username is already taken' not in result.stderr" @@ -28,8 +28,8 @@ - name: Update Django super user password k8s_exec: namespace: "{{ ansible_operator_meta.namespace }}" - pod: "{{ awx_task_pod_name }}" - container: "{{ ansible_operator_meta.name }}-task" + pod: "{{ awx_web_pod_name }}" + container: "{{ ansible_operator_meta.name }}-web" command: awx-manage update_password --username='{{ admin_user }}' --password='{{ admin_password }}' register: result changed_when: "'Password updated' in result.stdout" @@ -39,8 +39,8 @@ - name: Check if legacy queue is present k8s_exec: namespace: "{{ ansible_operator_meta.namespace }}" - pod: "{{ awx_task_pod_name }}" - container: "{{ ansible_operator_meta.name }}-task" + pod: "{{ awx_web_pod_name }}" + container: "{{ ansible_operator_meta.name }}-web" command: >- bash -c "awx-manage list_instances | grep '^\[tower capacity=[0-9]*\]'" register: legacy_queue @@ -50,8 +50,8 @@ - name: Unregister legacy queue k8s_exec: namespace: "{{ ansible_operator_meta.namespace }}" - pod: "{{ awx_task_pod_name }}" - container: "{{ ansible_operator_meta.name }}-task" + pod: "{{ awx_web_pod_name }}" + container: "{{ ansible_operator_meta.name }}-web" command: >- bash -c "awx-manage unregister_queue --queuename=tower" when: "'[tower capacity=' in legacy_queue.stdout" @@ -74,8 +74,8 @@ - name: Register default execution environments (without authentication) k8s_exec: namespace: "{{ ansible_operator_meta.namespace }}" - pod: "{{ awx_task_pod_name }}" - container: "{{ ansible_operator_meta.name }}-task" + pod: "{{ awx_web_pod_name }}" + container: "{{ 
ansible_operator_meta.name }}-web" command: >- bash -c "awx-manage register_default_execution_environments" register: ree @@ -95,8 +95,8 @@ - name: Register default execution environments (with authentication) k8s_exec: namespace: "{{ ansible_operator_meta.namespace }}" - pod: "{{ awx_task_pod_name }}" - container: "{{ ansible_operator_meta.name }}-task" + pod: "{{ awx_web_pod_name }}" + container: "{{ ansible_operator_meta.name }}-web" command: >- bash -c "awx-manage register_default_execution_environments --registry-username='{{ default_execution_environment_pull_credentials_user }}' @@ -111,8 +111,8 @@ - name: Create preload data if necessary. # noqa 305 k8s_exec: namespace: "{{ ansible_operator_meta.namespace }}" - pod: "{{ awx_task_pod_name }}" - container: "{{ ansible_operator_meta.name }}-task" + pod: "{{ awx_web_pod_name }}" + container: "{{ ansible_operator_meta.name }}-web" command: >- bash -c "awx-manage create_preload_data" register: cdo diff --git a/roles/installer/tasks/install.yml b/roles/installer/tasks/install.yml index 2398ebb4d..caa9f9f8c 100644 --- a/roles/installer/tasks/install.yml +++ b/roles/installer/tasks/install.yml @@ -94,51 +94,13 @@ - name: Include resources configuration tasks include_tasks: resources_configuration.yml -- name: Check for pending migrations - k8s_exec: - namespace: "{{ ansible_operator_meta.namespace }}" - pod: "{{ awx_task_pod_name }}" - container: "{{ ansible_operator_meta.name }}-task" - command: >- - bash -c "awx-manage showmigrations | grep -v '[X]' | grep '[ ]' | wc -l" - changed_when: false - when: awx_task_pod_name != '' - register: database_check - -- name: Migrate the database if the K8s resources were updated # noqa 305 - k8s_exec: - namespace: "{{ ansible_operator_meta.namespace }}" - pod: "{{ awx_task_pod_name }}" - container: "{{ ansible_operator_meta.name }}-task" - command: | - bash -c " - function end_keepalive { - rc=$? 
- rm -f \"$1\" - kill $(cat /proc/$2/task/$2/children 2>/dev/null) 2>/dev/null || true - wait $2 || true - exit $rc - } - keepalive_file=\"$(mktemp)\" - while [[ -f \"$keepalive_file\" ]]; do - echo 'Database schema migration in progress...' - sleep 60 - done & - keepalive_pid=$! - trap 'end_keepalive \"$keepalive_file\" \"$keepalive_pid\"' EXIT SIGINT SIGTERM - echo keepalive_pid: $keepalive_pid - awx-manage migrate --noinput - echo 'Successful' - " - register: migrate_result - when: - - awx_task_pod_name != '' - - database_check is defined - - (database_check.stdout|trim) != '0' +- name: Migrate database to the latest schema + include_tasks: migrate_schema.yml + when: awx_web_pod_name != '' - name: Initialize Django include_tasks: initialize_django.yml - when: awx_task_pod_name != '' + when: awx_web_pod_name != '' - name: Update status variables include_tasks: update_status.yml diff --git a/roles/installer/tasks/migrate_schema.yml b/roles/installer/tasks/migrate_schema.yml new file mode 100644 index 000000000..ef41e1827 --- /dev/null +++ b/roles/installer/tasks/migrate_schema.yml @@ -0,0 +1,57 @@ +--- + +- name: Check for pending migrations + k8s_exec: + namespace: "{{ ansible_operator_meta.namespace }}" + pod: "{{ awx_web_pod_name }}" + container: "{{ ansible_operator_meta.name }}-web" + command: >- + bash -c "awx-manage showmigrations | grep -v '[X]' | grep '[ ]' | wc -l" + changed_when: false + when: awx_web_pod_name != '' + register: database_check + +- block: + - name: Get version of controller for tracking + k8s_exec: + namespace: "{{ ansible_operator_meta.namespace }}" + pod: "{{ awx_web_pod_name }}" + container: "{{ ansible_operator_meta.name }}-web" + command: >- + bash -c "awx-manage --version" + changed_when: false + register: version_check + + - name: Update instance version + set_fact: + version: "{{ version_check.stdout | trim }}" + + # It is possible to do a wait on this task to create the job and wait + # until it completes. 
Unfortunately, if the job doesn't finish within + # the timeout period that is considered an error. We only want this to + # error if there is an issue with creating the job. + - name: Create kubernetes job to perform the migration + k8s: + apply: yes + definition: "{{ lookup('template', 'jobs/migration.yaml.j2') }}" + register: migrate_result + + # This task is really only necessary for new installations. We need to + # ensure the database has a schema loaded before continuing with the + # initialization of admin user, etc. + - name: Watch for the migration job to finish + k8s_info: + kind: Job + namespace: "{{ ansible_operator_meta.namespace }}" + name: "{{ ansible_operator_meta.name }}-migration-{{ version }}" + register: result + until: + - result.resources[0].status.succeeded is defined + - result.resources[0].status.succeeded == 1 + retries: 180 + delay: 5 + ignore_errors: true + + when: + - database_check is defined + - (database_check.stdout|trim) != '0' diff --git a/roles/installer/tasks/resources_configuration.yml b/roles/installer/tasks/resources_configuration.yml index c811aeeb7..aea1e8508 100644 --- a/roles/installer/tasks/resources_configuration.yml +++ b/roles/installer/tasks/resources_configuration.yml @@ -1,28 +1,28 @@ --- -- name: Get the current resource task pod information. +- name: Get the current resource web pod information. k8s_info: api_version: v1 kind: Pod namespace: '{{ ansible_operator_meta.namespace }}' label_selectors: - - "app.kubernetes.io/name={{ ansible_operator_meta.name }}-task" + - "app.kubernetes.io/name={{ ansible_operator_meta.name }}-web" - "app.kubernetes.io/managed-by={{ deployment_type }}-operator" - "app.kubernetes.io/component={{ deployment_type }}" field_selectors: - status.phase=Running - register: awx_task_pod + register: awx_web_pod - name: Set the resource pod as a variable.
set_fact: - awx_task_pod: >- - {{ awx_task_pod['resources'] + awx_web_pod: >- + {{ awx_web_pod['resources'] | rejectattr('metadata.deletionTimestamp', 'defined') | sort(attribute='metadata.creationTimestamp') | first | default({}) }} - name: Set the resource pod name as a variable. set_fact: - awx_task_pod_name: "{{ awx_task_pod['metadata']['name'] | default('') }}" + awx_web_pod_name: "{{ awx_web_pod['metadata']['name'] | default('') }}" - name: Set user provided control plane ee image set_fact: @@ -249,11 +249,9 @@ k8s: apply: yes definition: "{{ lookup('template', 'deployments/{{ item }}.yaml.j2') }}" - wait: yes - wait_timeout: "{{ (120 * replicas) or 120 }}" loop: - - task - web + - task register: this_deployment_result - block: @@ -262,7 +260,7 @@ kind: Pod namespace: '{{ ansible_operator_meta.namespace }}' label_selectors: - - "app.kubernetes.io/name={{ ansible_operator_meta.name }}-task" + - "app.kubernetes.io/name={{ ansible_operator_meta.name }}-web" - "app.kubernetes.io/managed-by={{ deployment_type }}-operator" - "app.kubernetes.io/component={{ deployment_type }}" field_selectors: @@ -271,7 +269,7 @@ - name: Update new resource pod as a variable. set_fact: - awx_task_pod: >- + awx_web_pod: >- {{ _new_pod['resources'] | rejectattr('metadata.deletionTimestamp', 'defined') | sort(attribute='metadata.creationTimestamp') @@ -279,13 +277,13 @@ - name: Update new resource pod name as a variable. set_fact: - awx_task_pod_name: '{{ awx_task_pod["metadata"]["name"] | default("")}}' + awx_web_pod_name: '{{ awx_web_pod["metadata"]["name"] | default("")}}' when: - this_deployment_result.changed - name: Verify the resource pod name is populated. assert: that: - - awx_task_pod_name != '' + - awx_web_pod_name != '' fail_msg: "Could not find the tower pod's name." 
- when: task_replicas | int > 0 or (task_replicas == '' and replicas > 0) + when: web_replicas | int > 0 or (web_replicas == '' and replicas > 0) diff --git a/roles/installer/tasks/update_status.yml b/roles/installer/tasks/update_status.yml index 9f59b3644..1c0b7b6b4 100644 --- a/roles/installer/tasks/update_status.yml +++ b/roles/installer/tasks/update_status.yml @@ -47,13 +47,13 @@ - name: Retrieve instance version k8s_exec: namespace: "{{ ansible_operator_meta.namespace }}" - pod: "{{ awx_task_pod_name }}" - container: "{{ ansible_operator_meta.name }}-task" + pod: "{{ awx_web_pod_name }}" + container: "{{ ansible_operator_meta.name }}-web" command: >- bash -c "awx-manage --version" register: instance_version changed_when: false - when: awx_task_pod_name != '' + when: awx_web_pod_name != '' - name: Update version status operator_sdk.util.k8s_status: @@ -111,5 +111,5 @@ name: "{{ ansible_operator_meta.name }}" namespace: "{{ ansible_operator_meta.namespace }}" status: - upgradedPostgresVersion: "{{ upgraded_postgres_version }}" + upgradedPostgresVersion: "{{ upgraded_postgres_version | string }}" when: upgraded_postgres_version is defined diff --git a/roles/installer/templates/deployments/task.yaml.j2 b/roles/installer/templates/deployments/task.yaml.j2 index cb57fbd13..af5c4b848 100644 --- a/roles/installer/templates/deployments/task.yaml.j2 +++ b/roles/installer/templates/deployments/task.yaml.j2 @@ -74,7 +74,28 @@ spec: priorityClassName: '{{ control_plane_priority_class }}' {% endif %} initContainers: - - name: init + - name: init-database + image: '{{ _image }}' + imagePullPolicy: '{{ image_pull_policy }}' + resources: {{ init_container_resource_requirements }} + command: + - /bin/sh + - -c + - wait-for-migrations + volumeMounts: + - name: {{ ansible_operator_meta.name }}-application-credentials + mountPath: "/etc/tower/conf.d/credentials.py" + subPath: credentials.py + readOnly: true + - name: "{{ secret_key_secret_name }}" + mountPath: 
/etc/tower/SECRET_KEY + subPath: SECRET_KEY + readOnly: true + - name: {{ ansible_operator_meta.name }}-settings + mountPath: "/etc/tower/settings.py" + subPath: settings.py + readOnly: true + - name: init-receptor image: '{{ _init_container_image }}' imagePullPolicy: '{{ image_pull_policy }}' resources: {{ init_container_resource_requirements }} @@ -188,6 +209,30 @@ spec: {% endif %} {% if task_args %} args: {{ task_args }} +{% endif %} +{% if task_liveness_period|int > 0 %} + livenessProbe: + exec: + command: + - sh + - -c + - | + (exit $(/usr/bin/supervisorctl -c /etc/supervisord_task.conf status | grep -vc RUNNING)) + initialDelaySeconds: {{ task_liveness_initial_delay }} + periodSeconds: {{ task_liveness_period }} + failureThreshold: {{ task_liveness_failure_threshold }} + timeoutSeconds: {{ task_liveness_timeout }} +{% endif %} +{% if task_readiness_period|int > 0 %} + readinessProbe: + exec: + command: + - /usr/bin/awx-manage + - check + initialDelaySeconds: {{ task_readiness_initial_delay }} + periodSeconds: {{ task_readiness_period }} + failureThreshold: {{ task_readiness_failure_threshold }} + timeoutSeconds: {{ task_readiness_timeout }} {% endif %} volumeMounts: {% if bundle_ca_crt %} diff --git a/roles/installer/templates/deployments/web.yaml.j2 b/roles/installer/templates/deployments/web.yaml.j2 index 1a7318348..568a6d690 100644 --- a/roles/installer/templates/deployments/web.yaml.j2 +++ b/roles/installer/templates/deployments/web.yaml.j2 @@ -162,6 +162,31 @@ spec: - containerPort: 8052 {% if ingress_type | lower == 'route' and route_tls_termination_mechanism | lower == 'passthrough' %} - containerPort: 8053 +{% endif %} +{% if web_liveness_period|int > 0 %} + livenessProbe: + exec: + command: + - sh + - -c + - | + (exit $(/usr/bin/supervisorctl -c /etc/supervisord_web.conf status | grep -vc RUNNING)) + initialDelaySeconds: {{ web_liveness_initial_delay }} + periodSeconds: {{ web_liveness_period }} + failureThreshold: {{ web_liveness_failure_threshold
}} + timeoutSeconds: {{ web_liveness_timeout }} +{% endif %} +{% if web_readiness_period|int > 0 %} + readinessProbe: + # HTTP probe: a probe accepts exactly one handler (exec, httpGet, tcpSocket or grpc) + httpGet: + path: /api/v2/ping/ + scheme: HTTP + port: 8052 + initialDelaySeconds: {{ web_readiness_initial_delay }} + periodSeconds: {{ web_readiness_period }} + failureThreshold: {{ web_readiness_failure_threshold }} + timeoutSeconds: {{ web_readiness_timeout }} {% endif %} volumeMounts: {% if bundle_ca_crt %} diff --git a/roles/installer/templates/jobs/migration.yaml.j2 b/roles/installer/templates/jobs/migration.yaml.j2 new file mode 100644 index 000000000..e306784c4 --- /dev/null +++ b/roles/installer/templates/jobs/migration.yaml.j2 @@ -0,0 +1,57 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: '{{ ansible_operator_meta.name }}-migration-{{ version }}' + namespace: '{{ ansible_operator_meta.namespace }}' + labels: + {{ lookup("template", "../common/templates/labels/common.yaml.j2") | indent(width=4) | trim }} + {{ lookup("template", "../common/templates/labels/version.yaml.j2") | indent(width=4) | trim }} +spec: + template: + spec: + containers: + - name: "migration-job" + image: '{{ _image }}' + command: + - /usr/bin/awx-manage + - migrate + - --noinput + volumeMounts: + - name: {{ ansible_operator_meta.name }}-application-credentials + mountPath: "/etc/tower/conf.d/credentials.py" + subPath: credentials.py + readOnly: true + - name: "{{ secret_key_secret_name }}" + mountPath: /etc/tower/SECRET_KEY + subPath: SECRET_KEY + readOnly: true + - name: {{ ansible_operator_meta.name }}-settings + mountPath: "/etc/tower/settings.py" + subPath: settings.py + readOnly: true + volumes: + - name: "{{ ansible_operator_meta.name }}-application-credentials" + secret: + secretName: "{{ ansible_operator_meta.name }}-app-credentials" + items: + - key: credentials.py + path: 'credentials.py' + - key: ldap.py + path: 'ldap.py' + - key: execution_environments.py + path: 'execution_environments.py' + - name: "{{ secret_key_secret_name }}" + secret: +
secretName: '{{ secret_key_secret_name }}' + items: + - key: secret_key + path: SECRET_KEY + - name: {{ ansible_operator_meta.name }}-settings + configMap: + name: '{{ ansible_operator_meta.name }}-{{ deployment_type }}-configmap' + items: + - key: settings + path: settings.py + dnsPolicy: ClusterFirst + restartPolicy: Never + terminationGracePeriodSeconds: 30