diff --git a/config/crd/bases/awx.ansible.com_awxs.yaml b/config/crd/bases/awx.ansible.com_awxs.yaml index e3387b26e..48d4b1fe4 100644 --- a/config/crd/bases/awx.ansible.com_awxs.yaml +++ b/config/crd/bases/awx.ansible.com_awxs.yaml @@ -1571,6 +1571,86 @@ spec: description: Number of task instance replicas type: integer format: int32 + web_liveness_initial_delay: + description: Initial delay before starting liveness checks on web pod + type: integer + default: 5 + format: int32 + task_liveness_initial_delay: + description: Initial delay before starting liveness checks on task pod + type: integer + default: 5 + format: int32 + web_liveness_period: + description: Time period in seconds between each liveness check for the web pod + type: integer + default: 0 + format: int32 + task_liveness_period: + description: Time period in seconds between each liveness check for the task pod + type: integer + default: 0 + format: int32 + web_liveness_failure_threshold: + description: Number of consecutive failure events to identify failure of web pod + type: integer + default: 3 + format: int32 + task_liveness_failure_threshold: + description: Number of consecutive failure events to identify failure of task pod + type: integer + default: 3 + format: int32 + web_liveness_timeout: + description: Number of seconds to wait for a probe response from web pod + type: integer + default: 1 + format: int32 + task_liveness_timeout: + description: Number of seconds to wait for a probe response from task pod + type: integer + default: 1 + format: int32 + web_readiness_initial_delay: + description: Initial delay before starting readiness checks on web pod + type: integer + default: 20 + format: int32 + task_readiness_initial_delay: + description: Initial delay before starting readiness checks on task pod + type: integer + default: 20 + format: int32 + web_readiness_period: + description: Time period in seconds between each readiness check for the web pod + type: integer + default: 0 + format: 
int32 + task_readiness_period: + description: Time period in seconds between each readiness check for the task pod + type: integer + default: 0 + format: int32 + web_readiness_failure_threshold: + description: Number of consecutive failure events to identify failure of web pod + type: integer + default: 3 + format: int32 + task_readiness_failure_threshold: + description: Number of consecutive failure events to identify failure of task pod + type: integer + default: 3 + format: int32 + web_readiness_timeout: + description: Number of seconds to wait for a probe response from web pod + type: integer + default: 1 + format: int32 + task_readiness_timeout: + description: Number of seconds to wait for a probe response from task pod + type: integer + default: 1 + format: int32 garbage_collect_secrets: description: Whether or not to remove secrets upon instance removal default: false diff --git a/docs/user-guide/advanced-configuration/container-probes.md b/docs/user-guide/advanced-configuration/container-probes.md new file mode 100644 index 000000000..338e53773 --- /dev/null +++ b/docs/user-guide/advanced-configuration/container-probes.md @@ -0,0 +1,41 @@ +#### Container Probes +These parameters control the usage of liveness and readiness container probes for +the web and task containers. + +#### Web / Task Container Liveness Check + +The liveness probe queries the status of the supervisor daemon of the container. The probe will fail if it +detects one of the services in a state other than "RUNNING". + +| Name | Description | Default | +| -------------| ---------------------------------- | ------- | +| web_liveness_period | Time period in seconds between each probe check. The value of 0 disables the probe. 
| 0 | +| web_liveness_initial_delay | Initial delay before starting probes in seconds | 5 | +| web_liveness_failure_threshold| Number of consecutive failure events to identify failure of container | 3 | +| web_liveness_timeout | Number of seconds to wait for a probe response from container | 1 | +| task_liveness_period | Time period in seconds between each probe check. The value of 0 disables the probe. | 0 | +| task_liveness_initial_delay | Initial delay before starting probes in seconds | 5 | +| task_liveness_failure_threshold| Number of consecutive failure events to identify failure of container | 3 | +| task_liveness_timeout | Number of seconds to wait for a probe response from container | 1 | + +#### Web Container Readiness Check + +This is an HTTP check against the status endpoint to confirm the system is still able to respond to web requests. + +| Name | Description | Default | +| -------------| ---------------------------------- | ------- | +| web_readiness_period | Time period in seconds between each probe check. The value of 0 disables the probe. | 0 | +| web_readiness_initial_delay | Initial delay before starting probes in seconds | 20 | +| web_readiness_failure_threshold| Number of consecutive failure events to identify failure of container | 3 | +| web_readiness_timeout | Number of seconds to wait for a probe response from container | 1 | + +#### Task Container Readiness Check + +This is a command probe using the builtin check command of the awx-manage utility. + +| Name | Description | Default | +| -------------| ---------------------------------- | ------- | +| task_readiness_period | Time period in seconds between each probe check. The value of 0 disables the probe. 
| 0 | +| task_readiness_initial_delay | Initial delay before starting probes in seconds | 20 | +| task_readiness_failure_threshold| Number of consecutive failure events to identify failure of container | 3 | +| task_readiness_timeout | Number of seconds to wait for a probe response from container | 1 | diff --git a/roles/installer/tasks/install.yml b/roles/installer/tasks/install.yml index 2398ebb4d..540e86e18 100644 --- a/roles/installer/tasks/install.yml +++ b/roles/installer/tasks/install.yml @@ -94,48 +94,6 @@ - name: Include resources configuration tasks include_tasks: resources_configuration.yml -- name: Check for pending migrations - k8s_exec: - namespace: "{{ ansible_operator_meta.namespace }}" - pod: "{{ awx_task_pod_name }}" - container: "{{ ansible_operator_meta.name }}-task" - command: >- - bash -c "awx-manage showmigrations | grep -v '[X]' | grep '[ ]' | wc -l" - changed_when: false - when: awx_task_pod_name != '' - register: database_check - -- name: Migrate the database if the K8s resources were updated # noqa 305 - k8s_exec: - namespace: "{{ ansible_operator_meta.namespace }}" - pod: "{{ awx_task_pod_name }}" - container: "{{ ansible_operator_meta.name }}-task" - command: | - bash -c " - function end_keepalive { - rc=$? - rm -f \"$1\" - kill $(cat /proc/$2/task/$2/children 2>/dev/null) 2>/dev/null || true - wait $2 || true - exit $rc - } - keepalive_file=\"$(mktemp)\" - while [[ -f \"$keepalive_file\" ]]; do - echo 'Database schema migration in progress...' - sleep 60 - done & - keepalive_pid=$! 
- trap 'end_keepalive \"$keepalive_file\" \"$keepalive_pid\"' EXIT SIGINT SIGTERM - echo keepalive_pid: $keepalive_pid - awx-manage migrate --noinput - echo 'Successful' - " - register: migrate_result - when: - - awx_task_pod_name != '' - - database_check is defined - - (database_check.stdout|trim) != '0' - - name: Initialize Django include_tasks: initialize_django.yml when: awx_task_pod_name != '' diff --git a/roles/installer/tasks/resources_configuration.yml b/roles/installer/tasks/resources_configuration.yml index c811aeeb7..919640a45 100644 --- a/roles/installer/tasks/resources_configuration.yml +++ b/roles/installer/tasks/resources_configuration.yml @@ -12,7 +12,7 @@ - status.phase=Running register: awx_task_pod -- name: Set the resource pod as a variable. +- name: Set the resource task pod as a variable. set_fact: awx_task_pod: >- {{ awx_task_pod['resources'] @@ -20,7 +20,7 @@ | sort(attribute='metadata.creationTimestamp') | first | default({}) }} -- name: Set the resource pod name as a variable. +- name: Set the resource task pod name as a variable. set_fact: awx_task_pod_name: "{{ awx_task_pod['metadata']['name'] | default('') }}" @@ -249,15 +249,13 @@ k8s: apply: yes definition: "{{ lookup('template', 'deployments/{{ item }}.yaml.j2') }}" - wait: yes - wait_timeout: "{{ (120 * replicas) or 120 }}" loop: - task - web register: this_deployment_result - block: - - name: Get the new resource pod information after updating resource. + - name: Get the new task pod information after updating resource. k8s_info: kind: Pod namespace: '{{ ansible_operator_meta.namespace }}' @@ -266,10 +264,10 @@ - "app.kubernetes.io/managed-by={{ deployment_type }}-operator" - "app.kubernetes.io/component={{ deployment_type }}" field_selectors: - - status.phase=Running + - status.phase=Pending register: _new_pod - - name: Update new resource pod as a variable. + - name: Update new task pod as a variable. 
set_fact: awx_task_pod: >- {{ _new_pod['resources'] @@ -277,9 +275,49 @@ | sort(attribute='metadata.creationTimestamp') | last | default({}) }} - - name: Update new resource pod name as a variable. + - name: Update new task pod name as a variable. set_fact: awx_task_pod_name: '{{ awx_task_pod["metadata"]["name"] | default("")}}' + + - name: Get the new web pod information after updating resource. + k8s_info: + kind: Pod + namespace: '{{ ansible_operator_meta.namespace }}' + label_selectors: + - "app.kubernetes.io/name={{ ansible_operator_meta.name }}-web" + - "app.kubernetes.io/managed-by={{ deployment_type }}-operator" + - "app.kubernetes.io/component={{ deployment_type }}" + field_selectors: + - status.phase=Running + register: _new_pod + + - name: Update new web pod as a variable. + set_fact: + awx_web_pod: >- + {{ _new_pod['resources'] + | rejectattr('metadata.deletionTimestamp', 'defined') + | sort(attribute='metadata.creationTimestamp') + | last | default({}) }} + + - name: Update new web pod name as a variable. + set_fact: + awx_web_pod_name: '{{ awx_web_pod["metadata"]["name"] | default("")}}' + + # The only purpose of this task is to delay the initial installation + # of AWX until the database is properly populated with tables. + # The web containers will show a migration screen during this time. + # We use the web pod instead of the task pod for monitoring in + # order to simplify error handling. 
+ - name: Wait for any migrations to finish before finishing installation # noqa 305 + k8s_exec: + namespace: "{{ ansible_operator_meta.namespace }}" + pod: "{{ awx_web_pod_name }}" + container: "{{ ansible_operator_meta.name }}-web" + command: "/bin/sh -c 'wait-for-migrations'" + register: migrate_result + when: + - awx_web_pod_name != '' + when: - this_deployment_result.changed diff --git a/roles/installer/templates/deployments/task.yaml.j2 b/roles/installer/templates/deployments/task.yaml.j2 index cb57fbd13..5adc13aac 100644 --- a/roles/installer/templates/deployments/task.yaml.j2 +++ b/roles/installer/templates/deployments/task.yaml.j2 @@ -74,7 +74,28 @@ spec: priorityClassName: '{{ control_plane_priority_class }}' {% endif %} initContainers: - - name: init + - name: init-database + image: '{{ _image }}' + imagePullPolicy: '{{ image_pull_policy }}' + resources: {{ init_container_resource_requirements }} + command: + - /usr/bin/awx-manage + - migrate + - --noinput + volumeMounts: + - name: {{ ansible_operator_meta.name }}-application-credentials + mountPath: "/etc/tower/conf.d/credentials.py" + subPath: credentials.py + readOnly: true + - name: "{{ secret_key_secret_name }}" + mountPath: /etc/tower/SECRET_KEY + subPath: SECRET_KEY + readOnly: true + - name: {{ ansible_operator_meta.name }}-settings + mountPath: "/etc/tower/settings.py" + subPath: settings.py + readOnly: true + - name: init-receptor image: '{{ _init_container_image }}' imagePullPolicy: '{{ image_pull_policy }}' resources: {{ init_container_resource_requirements }} @@ -188,6 +209,30 @@ spec: {% endif %} {% if task_args %} args: {{ task_args }} +{% endif %} +{% if task_liveness_period|int > 0 %} + livenessProbe: + exec: + command: + - sh + - -c + - | + (exit $(/usr/bin/supervisorctl -c /etc/supervisord_task.conf status | grep -vc RUNNING)) + initialDelaySeconds: {{ task_liveness_initial_delay }} + periodSeconds: {{ task_liveness_period }} + failureThreshold: {{ task_liveness_failure_threshold }} + 
timeoutSeconds: {{ task_liveness_timeout }} +{% endif %} +{% if task_readiness_period|int > 0 %} + readinessProbe: + exec: + command: + - /usr/bin/awx-manage + - check + initialDelaySeconds: {{ task_readiness_initial_delay }} + periodSeconds: {{ task_readiness_period }} + failureThreshold: {{ task_readiness_failure_threshold }} + timeoutSeconds: {{ task_readiness_timeout }} {% endif %} volumeMounts: {% if bundle_ca_crt %} diff --git a/roles/installer/templates/deployments/web.yaml.j2 b/roles/installer/templates/deployments/web.yaml.j2 index 1a7318348..568a6d690 100644 --- a/roles/installer/templates/deployments/web.yaml.j2 +++ b/roles/installer/templates/deployments/web.yaml.j2 @@ -162,6 +162,30 @@ spec: - containerPort: 8052 {% if ingress_type | lower == 'route' and route_tls_termination_mechanism | lower == 'passthrough' %} - containerPort: 8053 +{% endif %} +{% if web_liveness_period|int > 0 %} + livenessProbe: + exec: + command: + - sh + - -c + - | + (exit $(/usr/bin/supervisorctl -c /etc/supervisord_web.conf status | grep -vc RUNNING)) + initialDelaySeconds: {{ web_liveness_initial_delay }} + periodSeconds: {{ web_liveness_period }} + failureThreshold: {{ web_liveness_failure_threshold }} + timeoutSeconds: {{ web_liveness_timeout }} +{% endif %} +{% if web_readiness_period|int > 0 %} + readinessProbe: + httpGet: + path: /api/v2/ping/ + scheme: HTTP + port: 8052 + initialDelaySeconds: {{ web_readiness_initial_delay }} + periodSeconds: {{ web_readiness_period }} + failureThreshold: {{ web_readiness_failure_threshold }} + timeoutSeconds: {{ web_readiness_timeout }} {% endif %} volumeMounts: {% if bundle_ca_crt %}