Enable the supervisor process-exit listener and systemd auto-restart for the
dhcp_relay and pmon containers.

What this patch does:
  * Copies supervisor-proc-exit-listener and a per-container critical_processes
    list into the docker-dhcp-relay and docker-platform-monitor images.
  * Registers the listener in each container's supervisord configuration for
    PROCESS_STATE_EXITED events.
  * Adds Restart=always / RestartSec=30 and a start limit (3 starts per 1200 s)
    to the dhcp_relay and pmon systemd units.
  * Makes the listener also match on the supervisor group name.

Reviewer notes:
  * In files/scripts/supervisor-proc-exit-listener the membership checks are
    parenthesized so that `expected == 0` guards BOTH the processname and the
    groupname test. Without the parentheses `and` binds tighter than `or`, so
    an expected exit of a process whose group is listed in critical_processes
    would still terminate supervisor.
  * NOTE(review): leading whitespace inside the hunks was reconstructed from a
    whitespace-collapsed copy of this patch; confirm it against the target
    files before applying.
  * NOTE(review): the docker-platform-monitor Dockerfile.j2 hunk adds COPY
    lines for start.sh/lm-sensors.sh and supervisord.conf that are still
    present as context just above; this looks redundant -- confirm.

diff --git a/dockers/docker-dhcp-relay/Dockerfile.j2 b/dockers/docker-dhcp-relay/Dockerfile.j2
index 0a760d301f3a..167d097d3af3 100644
--- a/dockers/docker-dhcp-relay/Dockerfile.j2
+++ b/dockers/docker-dhcp-relay/Dockerfile.j2
@@ -27,5 +27,7 @@ RUN rm -rf /debs
 
 COPY ["docker_init.sh", "start.sh", "/usr/bin/"]
 COPY ["docker-dhcp-relay.supervisord.conf.j2", "wait_for_intf.sh.j2", "/usr/share/sonic/templates/"]
+COPY ["files/supervisor-proc-exit-listener", "/usr/bin"]
+COPY ["critical_processes", "/etc/supervisor"]
 
 ENTRYPOINT ["/usr/bin/docker_init.sh"]
diff --git a/dockers/docker-dhcp-relay/critical_processes b/dockers/docker-dhcp-relay/critical_processes
new file mode 100644
index 000000000000..ddb183963a67
--- /dev/null
+++ b/dockers/docker-dhcp-relay/critical_processes
@@ -0,0 +1 @@
+isc-dhcp-relay
diff --git a/dockers/docker-dhcp-relay/docker-dhcp-relay.supervisord.conf.j2 b/dockers/docker-dhcp-relay/docker-dhcp-relay.supervisord.conf.j2
index 393589486399..000bc039dac3 100644
--- a/dockers/docker-dhcp-relay/docker-dhcp-relay.supervisord.conf.j2
+++ b/dockers/docker-dhcp-relay/docker-dhcp-relay.supervisord.conf.j2
@@ -3,6 +3,12 @@ logfile_maxbytes=1MB
 logfile_backups=2
 nodaemon=true
 
+[eventlistener:supervisor-proc-exit-listener]
+command=/usr/bin/supervisor-proc-exit-listener
+events=PROCESS_STATE_EXITED
+autostart=true
+autorestart=unexpected
+
 [program:start.sh]
 command=/usr/bin/start.sh
 priority=1
diff --git a/dockers/docker-platform-monitor/Dockerfile.j2 b/dockers/docker-platform-monitor/Dockerfile.j2
index 4ba905945131..cac0b24857b9 100755
--- a/dockers/docker-platform-monitor/Dockerfile.j2
+++ b/dockers/docker-platform-monitor/Dockerfile.j2
@@ -37,13 +37,22 @@ RUN pip install /python-wheels/{{ whl }}
 {% endif %}
 
 # Clean up
-RUN apt-get purge -y python-pip
-RUN apt-get clean -y
-RUN apt-get autoclean -y
-RUN apt-get autoremove -y
-RUN rm -rf /debs /python-wheels ~/.cache
 
 COPY ["start.sh", "lm-sensors.sh", "/usr/bin/"]
 COPY ["supervisord.conf", "/etc/supervisor/conf.d/"]
 
+RUN apt-get purge -y \
+        python-pip && \
+    apt-get clean -y && \
+    apt-get autoclean -y && \
+    apt-get autoremove -y && \
+    rm -rf /debs \
+        /python-wheels \
+        ~/.cache
+
+COPY ["start.sh", "lm-sensors.sh", "/usr/bin/"]
+COPY ["supervisord.conf", "/etc/supervisor/conf.d/"]
+COPY ["files/supervisor-proc-exit-listener", "/usr/bin"]
+COPY ["critical_processes", "/etc/supervisor"]
+
 ENTRYPOINT ["/usr/bin/supervisord"]
diff --git a/dockers/docker-platform-monitor/critical_processes b/dockers/docker-platform-monitor/critical_processes
new file mode 100644
index 000000000000..5f73b03ad146
--- /dev/null
+++ b/dockers/docker-platform-monitor/critical_processes
@@ -0,0 +1,3 @@
+fancontrol
+ledd
+xcvrd
diff --git a/dockers/docker-platform-monitor/supervisord.conf b/dockers/docker-platform-monitor/supervisord.conf
index aa947ce2c9ae..bb56022c06a5 100644
--- a/dockers/docker-platform-monitor/supervisord.conf
+++ b/dockers/docker-platform-monitor/supervisord.conf
@@ -3,6 +3,12 @@ logfile_maxbytes=1MB
 logfile_backups=2
 nodaemon=true
 
+[eventlistener:supervisor-proc-exit-listener]
+command=/usr/bin/supervisor-proc-exit-listener
+events=PROCESS_STATE_EXITED
+autostart=true
+autorestart=unexpected
+
 [program:start.sh]
 command=/usr/bin/start.sh
 priority=1
diff --git a/files/build_templates/dhcp_relay.service.j2 b/files/build_templates/dhcp_relay.service.j2
index cf527772fd01..d501a663feba 100644
--- a/files/build_templates/dhcp_relay.service.j2
+++ b/files/build_templates/dhcp_relay.service.j2
@@ -3,12 +3,16 @@ Description=DHCP relay container
 Requires=updategraph.service
 After=updategraph.service swss.service syncd.service teamd.service
 Before=ntp-config.service
+StartLimitIntervalSec=1200
+StartLimitBurst=3
 
 [Service]
 User={{ sonicadmin_user }}
 ExecStartPre=/usr/bin/{{ docker_container_name }}.sh start
 ExecStart=/usr/bin/{{ docker_container_name }}.sh wait
 ExecStop=/usr/bin/{{ docker_container_name }}.sh stop
+Restart=always
+RestartSec=30
 
 [Install]
 WantedBy=multi-user.target
diff --git a/files/build_templates/pmon.service.j2 b/files/build_templates/pmon.service.j2
index 33f3173b4887..75ae699955e2 100644
--- a/files/build_templates/pmon.service.j2
+++ b/files/build_templates/pmon.service.j2
@@ -3,12 +3,16 @@ Description=Platform monitor container
 Requires=updategraph.service
 After=updategraph.service
 Before=ntp-config.service
+StartLimitIntervalSec=1200
+StartLimitBurst=3
 
 [Service]
 User={{ sonicadmin_user }}
 ExecStartPre=/usr/bin/{{docker_container_name}}.sh start
 ExecStart=/usr/bin/{{docker_container_name}}.sh wait
 ExecStop=/usr/bin/{{docker_container_name}}.sh stop
+Restart=always
+RestartSec=30
 
 [Install]
 WantedBy=multi-user.target
diff --git a/files/scripts/supervisor-proc-exit-listener b/files/scripts/supervisor-proc-exit-listener
index 6bc62fc400c8..8d1735cd2b0c 100755
--- a/files/scripts/supervisor-proc-exit-listener
+++ b/files/scripts/supervisor-proc-exit-listener
@@ -33,9 +33,10 @@ def main():
 
             expected = int(payload_headers['expected'])
             processname = payload_headers['processname']
+            groupname = payload_headers['groupname']
 
             # If a critical process exited unexpectedly, terminate supervisor
-            if expected == 0 and processname in critical_processes:
+            if expected == 0 and (processname in critical_processes or groupname in critical_processes):
                 MSG_FORMAT_STR = "Process {} exited unxepectedly. Terminating supervisor..."
                 msg = MSG_FORMAT_STR.format(payload_headers['processname'])
                 syslog.syslog(syslog.LOG_INFO, msg)
diff --git a/rules/docker-dhcp-relay.mk b/rules/docker-dhcp-relay.mk
index 31c0529d03c9..13aa5b1a7ef6 100644
--- a/rules/docker-dhcp-relay.mk
+++ b/rules/docker-dhcp-relay.mk
@@ -22,3 +22,4 @@ SONIC_INSTALL_DOCKER_DBG_IMAGES += $(DOCKER_DHCP_RELAY_DBG)
 $(DOCKER_DHCP_RELAY)_CONTAINER_NAME = dhcp_relay
 $(DOCKER_DHCP_RELAY)_RUN_OPT += --net=host --privileged -t
 $(DOCKER_DHCP_RELAY)_RUN_OPT += -v /etc/sonic:/etc/sonic:ro
+$(DOCKER_DHCP_RELAY)_FILES += $(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT)
diff --git a/rules/docker-platform-monitor.mk b/rules/docker-platform-monitor.mk
index 59e97985d6bc..4d1cfee67dba 100644
--- a/rules/docker-platform-monitor.mk
+++ b/rules/docker-platform-monitor.mk
@@ -33,3 +33,4 @@ $(DOCKER_PLATFORM_MONITOR)_RUN_OPT += -v /usr/local/bin/act2_util:/usr/local/bin
 $(DOCKER_PLATFORM_MONITOR)_aboot_RUN_OPT += -v /usr/lib/python2.7/dist-packages/arista:/usr/lib/python2.7/dist-packages/arista:ro
 
 $(DOCKER_PLATFORM_MONITOR)_BASE_IMAGE_FILES += sensors:/usr/bin/sensors
+$(DOCKER_PLATFORM_MONITOR)_FILES += $(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT)
diff --git a/src/sonic-config-engine/tests/sample_output/docker-dhcp-relay.supervisord.conf b/src/sonic-config-engine/tests/sample_output/docker-dhcp-relay.supervisord.conf
index 16ea6dda4346..a29982a646f4 100644
--- a/src/sonic-config-engine/tests/sample_output/docker-dhcp-relay.supervisord.conf
+++ b/src/sonic-config-engine/tests/sample_output/docker-dhcp-relay.supervisord.conf
@@ -3,6 +3,12 @@ logfile_maxbytes=1MB
 logfile_backups=2
 nodaemon=true
 
+[eventlistener:supervisor-proc-exit-listener]
+command=/usr/bin/supervisor-proc-exit-listener
+events=PROCESS_STATE_EXITED
+autostart=true
+autorestart=unexpected
+
 [program:start.sh]
 command=/usr/bin/start.sh
 priority=1