forked from sonic-net/sonic-buildimage
-
Notifications
You must be signed in to change notification settings - Fork 15
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[Monit] Restart telemetry container if memory usage is beyond the thr…
…eshold (sonic-net#7645) Signed-off-by: Yong Zhao [email protected] Why I did it This PR aims to monitor the memory usage of streaming telemetry container and restart streaming telemetry container if memory usage is larger than the pre-defined threshold. How I did it I borrowed the system tool Monit to run a script memory_checker which will periodically check the memory usage of streaming telemetry container. If the memory usage of telemetry container is larger than the pre-defined threshold for 10 times during 20 cycles, then an alerting message will be written into syslog and at the same time Monit will run the script restart_service to restart the streaming telemetry container. How to verify it I verified this implementation on device str-7260cx3-acs-1.
- Loading branch information
Showing
4 changed files
with
221 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,109 @@ | ||
#!/usr/bin/env python3 | ||
|
||
""" | ||
memory_checker | ||
This script is part of the feature which will restart the container if memory | ||
usage of it is larger than the threshold value. | ||
This script is used to check the memory usage of specified container and | ||
is intended to be run by Monit. It will write an alerting message into | ||
syslog if memory usage of the container is larger than the threshold value for X | ||
times within Y cycles/minutes. Note that if a print(...) statement in this script | ||
is executed, its string will be appended to Monit's syslog messages. | ||
The following is an example in Monit configuration file to show how Monit will run | ||
this script: | ||
check program container_memory_<container_name> with path "/usr/bin/memory_checker <container_name> <threshold_value>" | ||
if status == 3 for X times within Y cycles exec "/usr/bin/restart_service <container_name>" | ||
""" | ||
|
||
import argparse | ||
import subprocess | ||
import sys | ||
import syslog | ||
import re | ||
|
||
|
||
def get_command_result(command):
    """Run a shell command and hand back its standard output.

    The script terminates instead of returning when the command does not
    succeed; the reason is written to syslog first.

    Args:
        command: A string holding the shell command line to execute.

    Returns:
        The standard output of the command with surrounding whitespace removed.

    Exits:
        1 if the command finishes with a non-zero return code.
        2 if starting the command raises OSError or ValueError.
    """
    output = ""

    try:
        process = subprocess.Popen(command, shell=True, universal_newlines=True,
                                   stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        # stderr is collected through the pipe but its content is not used.
        output, _ = process.communicate()
        if process.returncode != 0:
            failure_message = "[memory_checker] Failed to execute the command '{}'. Return code: '{}'".format(
                command, process.returncode)
            syslog.syslog(syslog.LOG_ERR, failure_message)
            sys.exit(1)
    except (OSError, ValueError) as err:
        launch_error_message = "[memory_checker] Failed to execute the command '{}'. Error: '{}'".format(
            command, err)
        syslog.syslog(syslog.LOG_ERR, launch_error_message)
        sys.exit(2)

    return output.strip()
|
||
|
||
def check_memory_usage(container_name, threshold_value):
    """Checks the memory usage of a container and reports when it exceeds the threshold.

    The usage is read from `docker stats`; the text before the first "/" in
    its output is taken as the current usage (a number followed by a unit).

    Args:
        container_name: A string representing the name of a container.
        threshold_value: An integer indicating the threshold value (Bytes) of memory usage.

    Returns:
        None.

    Exits:
        3 if the memory usage is larger than the threshold; the alerting
          message is printed to stdout before exiting.
        4 if the usage value or its unit cannot be interpreted; an error is
          written to syslog before exiting.
    """
    # Raw string: the backslashes are meant for the shell (they escape the
    # braces there), so they must not be parsed as Python escape sequences.
    # After str.format() the text is `\{\{.MemUsage\}\} <container_name>`.
    command = r"docker stats --no-stream --format \{{\{{.MemUsage\}}\}} {}".format(container_name)
    command_stdout = get_command_result(command)
    mem_usage = command_stdout.split("/")[0].strip()

    mem_usage_bytes = _mem_usage_to_bytes(mem_usage)
    if mem_usage_bytes is None:
        syslog.syslog(syslog.LOG_ERR, "[memory_checker] Failed to retrieve memory value from '{}'"
                      .format(mem_usage))
        sys.exit(4)

    if mem_usage_bytes > threshold_value:
        print("[{}]: Memory usage ({} Bytes) is larger than the threshold ({} Bytes)!"
              .format(container_name, mem_usage_bytes, threshold_value))
        sys.exit(3)


# Multipliers that convert a value in the given binary unit into bytes.
_MEM_UNIT_MULTIPLIERS = {
    "B": 1,
    "KiB": 1024,
    "MiB": 1024 ** 2,
    "GiB": 1024 ** 3,
    "TiB": 1024 ** 4,
}


def _mem_usage_to_bytes(mem_usage):
    """Converts a usage string such as "45.3MiB" into a number of bytes.

    Args:
        mem_usage: A string made of a decimal number directly followed by a unit.

    Returns:
        The usage in bytes as a float, or None if the string does not start
        with a number or the unit is not one of the known units. Returning
        None for an unknown unit keeps it from being treated as zero usage.
    """
    match_obj = re.match(r"\d+\.?\d*", mem_usage)
    if not match_obj:
        return None

    mem_usage_unit = mem_usage[match_obj.end():].strip()
    multiplier = _MEM_UNIT_MULTIPLIERS.get(mem_usage_unit)
    if multiplier is None:
        return None

    return float(match_obj.group()) * multiplier
|
||
|
||
def main():
    """Parse the container name and threshold from the command line and run the check."""
    description_text = ("Check memory usage of a container "
                        "and an alerting message will be written into syslog if memory usage "
                        "is larger than the threshold value")
    usage_text = "/usr/bin/memory_checker <container_name> <threshold_value_in_bytes>"

    arg_parser = argparse.ArgumentParser(description=description_text, usage=usage_text)
    arg_parser.add_argument("container_name", help="container name")
    # TODO: Currently the threshold value is hard coded as a command line argument and will
    # remove this in the new version since we want to read this value from 'CONFIG_DB'.
    arg_parser.add_argument("threshold_value", type=int, help="threshold value in bytes")
    parsed = arg_parser.parse_args()

    check_memory_usage(parsed.container_name, parsed.threshold_value)
|
||
|
||
# Run the memory check only when this file is executed as a script.
if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,105 @@ | ||
#!/usr/bin/env python3 | ||
|
||
""" | ||
restart_service | ||
This script is part of the feature which will restart the container if memory | ||
usage of it is larger than the threshold value. | ||
This script is intended to be run by Monit and is used to restart the specified | ||
container if the memory usage of it is larger than the threshold value for X | ||
times within Y cycles/minutes. | ||
The following is an example in Monit configuration file to show how Monit will run | ||
this script: | ||
check program container_memory_<container_name> with path "/usr/bin/memory_checker <container_name> <threshold_value>" | ||
if status == 3 for X times within Y cycles exec "/usr/bin/restart_service <container_name>" | ||
""" | ||
|
||
import argparse | ||
import sys | ||
import syslog | ||
import subprocess | ||
|
||
|
||
def get_command_result(command):
    """Executes command and return the exit code, stdout and stderr.

    Args:
        command: A string contains the command to be executed.

    Returns:
        A tuple of three elements:
          An integer status: 0 if the command returned zero, 1 if it returned
            a non-zero code, 2 if starting it raised OSError or ValueError.
          A string contains the output of stdout, stripped.
          A string contains the output of stderr, stripped; when the status is
            2 this is the text of the raised exception instead.
    """
    command_stdout = ""

    try:
        proc_instance = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                                         shell=True, universal_newlines=True)
        command_stdout, command_stderr = proc_instance.communicate()
    except (OSError, ValueError) as err:
        # Convert the exception to text so that the third element is a string
        # on every path, as the callers format it into log messages.
        return 2, command_stdout.strip(), str(err)

    exit_code = 0 if proc_instance.returncode == 0 else 1
    return exit_code, command_stdout.strip(), command_stderr.strip()
|
||
|
||
def reset_failed_flag(service_name):
    """Clear the failed status of a service via `systemctl reset-failed`.

    The attempt and its outcome are written to syslog; nothing is raised or
    returned when the command fails.

    Args:
        service_name: Name of the service (without the ".service" suffix).

    Returns:
        None
    """
    syslog.syslog(syslog.LOG_INFO,
                  "Resetting failed status of service '{}' ...".format(service_name))

    status, _, error_text = get_command_result(
        "sudo systemctl reset-failed {}.service".format(service_name))

    if status != 0:
        syslog.syslog(syslog.LOG_ERR,
                      "Failed to reset failed status of service '{}'. Error: {}".format(service_name, error_text))
        return

    syslog.syslog(syslog.LOG_INFO,
                  "Succeeded to reset failed status of service '{}.service'.".format(service_name))
|
||
|
||
def restart_service(service_name):
    """Clear the failed status of a service, then restart it with systemctl.

    A failed restart is reported to syslog; nothing is raised or returned.

    Args:
        service_name: Name of specified service (without the ".service" suffix).

    Returns:
        None.
    """
    # The failed status is reset first, before the restart is attempted.
    reset_failed_flag(service_name)

    syslog.syslog(syslog.LOG_INFO, "Restarting service '{}' ...".format(service_name))
    status, _, error_text = get_command_result(
        "sudo systemctl restart {}.service".format(service_name))

    if status == 0:
        return

    syslog.syslog(syslog.LOG_ERR,
                  "Failed to restart the service '{}'. Error: {}".format(service_name, error_text))
|
||
|
||
def main():
    """Read the service name from the command line and restart that service."""
    arg_parser = argparse.ArgumentParser(
        description="Restart a specific service",
        usage="/usr/bin/restart_service <service_name>",
    )
    arg_parser.add_argument("service_name", help="service name")

    parsed = arg_parser.parse_args()
    restart_service(parsed.service_name)
|
||
|
||
# Restart the requested service only when this file is executed as a script.
if __name__ == "__main__":
    main()