Skip to content

Commit

Permalink
[Services] Restart DHCP-Relay service upon unexpected critical process exit. (sonic-net#3667)
Browse files Browse the repository at this point in the history

Signed-off-by: Yong Zhao <[email protected]>

[Services] Restart Platform-monitor service upon unexpected critical process exit. (sonic-net#3689)
Signed-off-by: Yong Zhao <[email protected]>

Signed-off-by: Sangita Maity <[email protected]>

    RB=2126600
    G=lnos-reviewers
    R=pchaudha,pmao,vapatil,zxu
    A=zxu
  • Loading branch information
yozhao101 authored and samaity committed Jun 2, 2020
1 parent a3d571a commit 338d911
Show file tree
Hide file tree
Showing 12 changed files with 50 additions and 6 deletions.
2 changes: 2 additions & 0 deletions dockers/docker-dhcp-relay/Dockerfile.j2
Original file line number Diff line number Diff line change
Expand Up @@ -27,5 +27,7 @@ RUN rm -rf /debs

COPY ["docker_init.sh", "start.sh", "/usr/bin/"]
COPY ["docker-dhcp-relay.supervisord.conf.j2", "wait_for_intf.sh.j2", "/usr/share/sonic/templates/"]
COPY ["files/supervisor-proc-exit-listener", "/usr/bin"]
COPY ["critical_processes", "/etc/supervisor"]

ENTRYPOINT ["/usr/bin/docker_init.sh"]
1 change: 1 addition & 0 deletions dockers/docker-dhcp-relay/critical_processes
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
isc-dhcp-relay
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,12 @@ logfile_maxbytes=1MB
logfile_backups=2
nodaemon=true

; Event listener that runs /usr/bin/supervisor-proc-exit-listener and is
; notified by supervisord whenever a managed process exits.
[eventlistener:supervisor-proc-exit-listener]
command=/usr/bin/supervisor-proc-exit-listener
; Subscribe only to process-exit notifications.
events=PROCESS_STATE_EXITED
; Start the listener together with supervisord.
autostart=true
; Restart the listener only when it exits with an exit code supervisord
; does not consider expected.
autorestart=unexpected

[program:start.sh]
command=/usr/bin/start.sh
priority=1
Expand Down
19 changes: 14 additions & 5 deletions dockers/docker-platform-monitor/Dockerfile.j2
Original file line number Diff line number Diff line change
Expand Up @@ -37,13 +37,22 @@ RUN pip install /python-wheels/{{ whl }}
{% endif %}

# Clean up
RUN apt-get purge -y python-pip
RUN apt-get clean -y
RUN apt-get autoclean -y
RUN apt-get autoremove -y
RUN rm -rf /debs /python-wheels ~/.cache

COPY ["start.sh", "lm-sensors.sh", "/usr/bin/"]
COPY ["supervisord.conf", "/etc/supervisor/conf.d/"]

RUN apt-get purge -y \
python-pip && \
apt-get clean -y && \
apt-get autoclean -y && \
apt-get autoremove -y && \
rm -rf /debs \
/python-wheels \
~/.cache

COPY ["start.sh", "lm-sensors.sh", "/usr/bin/"]
COPY ["supervisord.conf", "/etc/supervisor/conf.d/"]
COPY ["files/supervisor-proc-exit-listener", "/usr/bin"]
COPY ["critical_processes", "/etc/supervisor"]

ENTRYPOINT ["/usr/bin/supervisord"]
3 changes: 3 additions & 0 deletions dockers/docker-platform-monitor/critical_processes
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
fancontrol
ledd
xcvrd
6 changes: 6 additions & 0 deletions dockers/docker-platform-monitor/supervisord.conf
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,12 @@ logfile_maxbytes=1MB
logfile_backups=2
nodaemon=true

; Event listener that runs /usr/bin/supervisor-proc-exit-listener and is
; notified by supervisord whenever a managed process exits.
[eventlistener:supervisor-proc-exit-listener]
command=/usr/bin/supervisor-proc-exit-listener
; Subscribe only to process-exit notifications.
events=PROCESS_STATE_EXITED
; Start the listener together with supervisord.
autostart=true
; Restart the listener only when it exits with an exit code supervisord
; does not consider expected.
autorestart=unexpected

[program:start.sh]
command=/usr/bin/start.sh
priority=1
Expand Down
4 changes: 4 additions & 0 deletions files/build_templates/dhcp_relay.service.j2
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,16 @@ Description=DHCP relay container
Requires=updategraph.service
After=updategraph.service swss.service syncd.service teamd.service
Before=ntp-config.service
StartLimitIntervalSec=1200
StartLimitBurst=3

# Runs the container control script as the configured admin user:
# "start" before the main command, "wait" as the main command, "stop" on shutdown.
[Service]
User={{ sonicadmin_user }}
ExecStartPre=/usr/bin/{{ docker_container_name }}.sh start
ExecStart=/usr/bin/{{ docker_container_name }}.sh wait
ExecStop=/usr/bin/{{ docker_container_name }}.sh stop
# Restart the service whenever it stops, waiting 30 seconds before each attempt.
# The number of attempts is bounded by StartLimitIntervalSec/StartLimitBurst in [Unit].
Restart=always
RestartSec=30

# When enabled, this unit is pulled in by multi-user.target.
[Install]
WantedBy=multi-user.target
4 changes: 4 additions & 0 deletions files/build_templates/pmon.service.j2
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,16 @@ Description=Platform monitor container
Requires=updategraph.service
After=updategraph.service
Before=ntp-config.service
StartLimitIntervalSec=1200
StartLimitBurst=3

# Runs the container control script as the configured admin user:
# "start" before the main command, "wait" as the main command, "stop" on shutdown.
[Service]
User={{ sonicadmin_user }}
ExecStartPre=/usr/bin/{{docker_container_name}}.sh start
ExecStart=/usr/bin/{{docker_container_name}}.sh wait
ExecStop=/usr/bin/{{docker_container_name}}.sh stop
# Restart the service whenever it stops, waiting 30 seconds before each attempt.
# The number of attempts is bounded by StartLimitIntervalSec/StartLimitBurst in [Unit].
Restart=always
RestartSec=30

# When enabled, this unit is pulled in by multi-user.target.
[Install]
WantedBy=multi-user.target
3 changes: 2 additions & 1 deletion files/scripts/supervisor-proc-exit-listener
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,10 @@ def main():

expected = int(payload_headers['expected'])
processname = payload_headers['processname']
groupname = payload_headers['groupname']

# If a critical process exited unexpectedly, terminate supervisor
if expected == 0 and processname in critical_processes:
if expected == 0 and processname in critical_processes or groupname in critical_processes:
MSG_FORMAT_STR = "Process {} exited unxepectedly. Terminating supervisor..."
msg = MSG_FORMAT_STR.format(payload_headers['processname'])
syslog.syslog(syslog.LOG_INFO, msg)
Expand Down
1 change: 1 addition & 0 deletions rules/docker-dhcp-relay.mk
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,4 @@ SONIC_INSTALL_DOCKER_DBG_IMAGES += $(DOCKER_DHCP_RELAY_DBG)
$(DOCKER_DHCP_RELAY)_CONTAINER_NAME = dhcp_relay
$(DOCKER_DHCP_RELAY)_RUN_OPT += --net=host --privileged -t
$(DOCKER_DHCP_RELAY)_RUN_OPT += -v /etc/sonic:/etc/sonic:ro
$(DOCKER_DHCP_RELAY)_FILES += $(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT)
1 change: 1 addition & 0 deletions rules/docker-platform-monitor.mk
Original file line number Diff line number Diff line change
Expand Up @@ -33,3 +33,4 @@ $(DOCKER_PLATFORM_MONITOR)_RUN_OPT += -v /usr/local/bin/act2_util:/usr/local/bin
$(DOCKER_PLATFORM_MONITOR)_aboot_RUN_OPT += -v /usr/lib/python2.7/dist-packages/arista:/usr/lib/python2.7/dist-packages/arista:ro

$(DOCKER_PLATFORM_MONITOR)_BASE_IMAGE_FILES += sensors:/usr/bin/sensors
$(DOCKER_PLATFORM_MONITOR)_FILES += $(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT)
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,12 @@ logfile_maxbytes=1MB
logfile_backups=2
nodaemon=true

; Event listener that runs /usr/bin/supervisor-proc-exit-listener and is
; notified by supervisord whenever a managed process exits.
[eventlistener:supervisor-proc-exit-listener]
command=/usr/bin/supervisor-proc-exit-listener
; Subscribe only to process-exit notifications.
events=PROCESS_STATE_EXITED
; Start the listener together with supervisord.
autostart=true
; Restart the listener only when it exits with an exit code supervisord
; does not consider expected.
autorestart=unexpected

[program:start.sh]
command=/usr/bin/start.sh
priority=1
Expand Down

0 comments on commit 338d911

Please sign in to comment.