From 1fe8ae49317427491e20e5fdaca96e8318e1e68a Mon Sep 17 00:00:00 2001 From: junchao Date: Tue, 19 Oct 2021 12:07:27 +0800 Subject: [PATCH 1/9] No longer check citical process/service status by monit --- rules/system-health.mk | 1 + src/system-health/health_checker/manager.py | 17 +- .../health_checker/service_checker.py | 274 ++++++++++++++- src/system-health/health_checker/utils.py | 2 +- src/system-health/setup.py | 1 + .../tests/etc/supervisor/critical_processes | 2 + .../system_health_monitoring_config.json | 11 + src/system-health/tests/test_system_health.py | 317 +++++++++++++++++- 8 files changed, 595 insertions(+), 30 deletions(-) create mode 100644 src/system-health/tests/etc/supervisor/critical_processes create mode 100644 src/system-health/tests/system_health_monitoring_config.json diff --git a/rules/system-health.mk b/rules/system-health.mk index 00cd48858d32..2d1012fe1843 100644 --- a/rules/system-health.mk +++ b/rules/system-health.mk @@ -4,6 +4,7 @@ SYSTEM_HEALTH = system_health-1.0-py3-none-any.whl $(SYSTEM_HEALTH)_SRC_PATH = $(SRC_PATH)/system-health $(SYSTEM_HEALTH)_PYTHON_VERSION = 3 $(SYSTEM_HEALTH)_DEPENDS = $(SONIC_PY_COMMON_PY3) $(SONIC_CONFIG_ENGINE_PY3) +$(SYSTEM_HEALTH)_DEBS_DEPENDS = $(LIBSWSSCOMMON) $(PYTHON3_SWSSCOMMON) SONIC_PYTHON_WHEELS += $(SYSTEM_HEALTH) export system_health_py3_wheel_path="$(addprefix $(PYTHON_WHEELS_PATH)/,$(SYSTEM_HEALTH))" diff --git a/src/system-health/health_checker/manager.py b/src/system-health/health_checker/manager.py index f31e482e336a..0b486bd1c94c 100644 --- a/src/system-health/health_checker/manager.py +++ b/src/system-health/health_checker/manager.py @@ -1,3 +1,11 @@ +from . import utils +from .config import Config +from .health_checker import HealthChecker +from .service_checker import ServiceChecker +from .hardware_checker import HardwareChecker +from .user_defined_checker import UserDefinedChecker + + class HealthCheckerManager(object): """ Manage all system health checkers and system health configuration. @@ -10,7 +18,6 @@ def __init__(self): self._checkers = [] self._state = self.STATE_BOOTING - from .config import Config self.config = Config() self.initialize() @@ -19,8 +26,6 @@ def initialize(self): Initialize the manager. Create service checker and hardware checker by default. :return: """ - from .service_checker import ServiceChecker - from .hardware_checker import HardwareChecker self._checkers.append(ServiceChecker()) self._checkers.append(HardwareChecker()) @@ -31,7 +36,6 @@ def check(self, chassis): :return: A tuple. The first element indicate the status of the checker; the second element is a dictionary that contains the status for all objects that was checked. """ - from .health_checker import HealthChecker HealthChecker.summary = HealthChecker.STATUS_OK stats = {} self.config.load_config() @@ -45,7 +49,6 @@ def check(self, chassis): self._do_check(checker, stats) if self.config.user_defined_checkers: - from .user_defined_checker import UserDefinedChecker for udc in self.config.user_defined_checkers: checker = UserDefinedChecker(udc) self._do_check(checker, stats) @@ -71,7 +74,6 @@ def _do_check(self, checker, stats): else: stats[category].update(info) except Exception as e: - from .health_checker import HealthChecker error_msg = 'Failed to perform health check for {} due to exception - {}'.format(checker, repr(e)) entry = {str(checker): { HealthChecker.INFO_FIELD_OBJECT_STATUS: HealthChecker.STATUS_NOT_OK, @@ -83,8 +85,7 @@ def _do_check(self, checker, stats): stats['Internal'].update(entry) def _is_system_booting(self): - from .utils import get_uptime - uptime = get_uptime() + uptime = utils.get_uptime() if not self.boot_timeout: self.boot_timeout = self.config.get_bootup_timeout() booting = uptime < self.boot_timeout diff --git a/src/system-health/health_checker/service_checker.py b/src/system-health/health_checker/service_checker.py index a98e2d33c3ad..37d52bfb5236 100644 --- a/src/system-health/health_checker/service_checker.py +++ b/src/system-health/health_checker/service_checker.py @@ -1,12 +1,32 @@ +import docker +import os +import pickle +import re + + +import swsssdk +from sonic_py_common import multi_asic +from sonic_py_common.logger import Logger from .health_checker import HealthChecker from . import utils +SYSLOG_IDENTIFIER = 'service_checker' +logger = Logger(log_identifier=SYSLOG_IDENTIFIER) + class ServiceChecker(HealthChecker): """ Checker that checks critical system service status via monit service. """ + # Cache file to save critical_process_dict + CRITICAL_PROCESS_CACHE = '/tmp/critical_process_cache' + + CRITICAL_PROCESSES_PATH = 'etc/supervisor/critical_processes' + + # Command to get merged directory of a container + GET_CONTAINER_FOLDER_CMD = 'docker inspect {} --format "{{{{.GraphDriver.Data.MergedDir}}}}"' + # Command to query the status of monit service. CHECK_MONIT_SERVICE_CMD = 'systemctl is-active monit.service' @@ -21,9 +41,159 @@ class ServiceChecker(HealthChecker): 'Filesystem': 'Accessible', 'Program': 'Status ok' } - + def __init__(self): HealthChecker.__init__(self) + self.critical_process_dict = {} + # Containers that has invalid critical_processes file + self.bad_containers = set() + + self.container_feature_dict = {} + + self.need_save_cache = False + + self.load_critical_process_cache() + + def get_expected_runnning_container_set(self, feature_table): + """Get a set of containers that are expected to running on SONiC + + Args: + feature_table (object): FEATURE table in CONFIG_DB + + Returns: + set: A set of container names + """ + containers = set() + container_feature_dict = {} + for feature_name in feature_table.keys(): + if feature_table[feature_name]["state"] not in ["disabled", "always_disabled"]: + if multi_asic.is_multi_asic(): + if feature_table[feature_name]["has_global_scope"] == "True": + containers.add(feature_name) + container_feature_dict[feature_name] = feature_name + if feature_table[feature_name]["has_per_asic_scope"] == "True": + num_asics = multi_asic.get_num_asics() + for asic_id in range(num_asics): + containers.add(feature_name + str(asic_id)) + container_feature_dict[feature_name + str(asic_id)] = feature_name + else: + containers.add(feature_name) + container_feature_dict[feature_name] = feature_name + + return containers, container_feature_dict + + def get_current_running_container_set(self): + """Get current running containers, if the running container is not in self.critical_process_dict, + try get the critical process list + + Returns: + set: A set of running containers + """ + DOCKER_CLIENT = docker.DockerClient(base_url='unix://var/run/docker.sock') + running_containers = set() + ctrs = DOCKER_CLIENT.containers + try: + lst = ctrs.list(filters={"status": "running"}) + + for ctr in lst: + running_containers.add(ctr.name) + if ctr.name not in self.critical_process_dict: + self.get_critical_process_by_container(ctr.name) + except docker.errors.APIError as err: + logger.log_debug("Failed to retrieve the running container list. Error: '{}'".format(err)) + pass + return running_containers + + def get_critical_process_list_from_file(self, container, critical_processes_file): + """ + @summary: Read the critical processes from CRITICAL_PROCESSES_FILE. + @return: A list which contain critical processes. + """ + critical_process_list = [] + + with open(critical_processes_file, 'r') as file: + for line in file: + # ignore blank lines + if re.match(r"^\s*$", line): + continue + line_info = line.strip(' \n').split(':') + if len(line_info) != 2: + # Invalid syntax in critical_processes file, save it as an error + self.bad_containers.add(container) + logger.log_error('Invalid syntax in critical_processes file of {}'.format(container)) + continue + + identifier_key = line_info[0].strip() + identifier_value = line_info[1].strip() + if identifier_key == "program" and identifier_value: + # We only count lines like "program:" + critical_process_list.append(identifier_value) + + return critical_process_list + + def get_critical_process_by_container(self, container): + """Get critical process for a given container + + Args: + container (str): container name + """ + # Get container volumn folder + container_folder = self._get_container_folder(container) + if not container_folder: + logger.log_debug('Failed to get container folder for {}'.format(container_folder)) + return + + # If container folder does not exist, the container is probably not up, retry it + if not os.path.exists(container_folder): + logger.log_debug('Container folder does not exist: {}'.format(container_folder)) + return + + # Get critical_processes file path + critical_processes_file = os.path.join(container_folder, ServiceChecker.CRITICAL_PROCESSES_PATH) + if not os.path.isfile(critical_processes_file): + # Critical process file does not exist, the container has no critical processes. + # This is fine, don't retry. + logger.log_debug('Failed to get critical process file for {}, {} does not exist'.format(container, critical_processes_file)) + self._update_critical_process_dict(container, []) + return + + # Get critical process list from critical_processes + critical_process_list = self.get_critical_process_list_from_file(container, critical_processes_file) + self._update_critical_process_dict(container, critical_process_list) + return + + def _update_critical_process_dict(self, container, critical_process_list): + self.critical_process_dict[container] = critical_process_list + self.need_save_cache = True + + def _get_container_folder(self, container): + return utils.run_command(ServiceChecker.GET_CONTAINER_FOLDER_CMD.format(container)) + + def save_critical_process_cache(self): + """Save self.critical_process_dict to a cache file + """ + if not self.need_save_cache: + return + + self.need_save_cache = True + if not self.critical_process_dict: + # if critical_process_dict is empty, don't save it + return + + if os.path.exists(ServiceChecker.CRITICAL_PROCESS_CACHE): + # if cache file exists, remove it + os.remove(ServiceChecker.CRITICAL_PROCESS_CACHE) + + with open(ServiceChecker.CRITICAL_PROCESS_CACHE, 'wb+') as f: + pickle.dump(self.critical_process_dict, f) + + def load_critical_process_cache(self): + if not os.path.isfile(ServiceChecker.CRITICAL_PROCESS_CACHE): + # cache file does not exist + return + + with open(ServiceChecker.CRITICAL_PROCESS_CACHE, 'rb') as f: + self.critical_process_dict = pickle.load(f) def reset(self): self._info = {} @@ -31,16 +201,14 @@ def reset(self): def get_category(self): return 'Services' - def check(self, config): + def check_by_monit(self, config): """ - Check critical system service status. Get and analyze the output of $CHECK_CMD, collect status for system, - process and file system. + et and analyze the output of $CHECK_CMD, collect status for file system or customize checker if any. :param config: Health checker configuration. :return: """ - self.reset() - output = utils.run_command(ServiceChecker.CHECK_MONIT_SERVICE_CMD).strip() - if output != 'active': + output = utils.run_command(ServiceChecker.CHECK_MONIT_SERVICE_CMD) + if not output or output != 'active': self.set_object_not_ok('Service', 'monit', 'monit service is not running') return @@ -58,7 +226,7 @@ def check(self, config): for line in lines[2:]: name = line[0:status_begin].strip() - if config.ignore_services and name in config.ignore_services: + if config and config.ignore_services and name in config.ignore_services: continue status = line[status_begin:type_begin].strip() service_type = line[type_begin:].strip() @@ -70,3 +238,93 @@ def check(self, config): else: self.set_object_ok(service_type, name) return + + def check_services(self, config): + """Check status of critical services and critical processes + + Args: + config (config.Config): Health checker configuration. + """ + config_db = swsssdk.ConfigDBConnector() + config_db.connect() + feature_table = config_db.get_table("FEATURE") + expected_running_containers, self.container_feature_dict = self.get_expected_runnning_container_set(feature_table) + current_running_containers = self.get_current_running_container_set() + self.save_critical_process_cache() + + not_running_containers = expected_running_containers.difference(current_running_containers) + for container in not_running_containers: + self.set_object_not_ok('Service', container, "Container '{}' is not running".format(container)) + + if not self.critical_process_dict: + # Critical process is empty, not expect + self.set_object_not_ok('Service', 'system', 'no critical process found') + return + + for container, critical_process_list in self.critical_process_dict.items(): + self.check_process_existence(container, critical_process_list, config, feature_table) + + for bad_container in self.bad_containers: + self.set_object_not_ok('Service', bad_container, 'Syntax of critical_processes file is incorrect') + + def check(self, config): + """ + Check critical system service status. + :param config: Health checker configuration. + :return: + """ + self.reset() + self.check_by_monit(config) + self.check_services(config) + + + def _parse_supervisorctl_status(self, process_status): + """Expected input: + arp_update RUNNING pid 67, uptime 1:03:56 + buffermgrd RUNNING pid 81, uptime 1:03:56 + + Args: + process_status (list): List of process status + """ + data = {} + for line in process_status: + line = line.strip() + if not line: + continue + items = line.split() + data[items[0].strip()] = items[1].strip() + return data + + def check_process_existence(self, container_name, critical_process_list, config, feature_table): + """ + @summary: Check whether the process in the specified container is running or not. + """ + feature_name = self.container_feature_dict[container_name] + if feature_name in feature_table: + # We look into the 'FEATURE' table to verify whether the container is disabled or not. + # If the container is diabled, we exit. + if ("state" in feature_table[feature_name] + and feature_table[feature_name]["state"] not in ["disabled", "always_disabled"]): + + # We are using supervisorctl status to check the critical process status. We cannot leverage psutil here because + # it not always possible to get process cmdline in supervisor.conf. E.g, cmdline of orchagent is "/usr/bin/orchagent", + # however, in supervisor.conf it is "/usr/bin/orchagent.sh" + cmd = 'docker exec {} bash -c "supervisorctl status"'.format(container_name) + process_status = utils.run_command(cmd) + if process_status is None: + for process_name in critical_process_list: + self.set_object_not_ok('Process', '{}:{}'.format(container_name, process_name), "'{}' is not running".format(process_name)) + return + + process_status = self._parse_supervisorctl_status(process_status.strip().splitlines()) + for process_name in critical_process_list: + if config and config.ignore_services and process_name in config.ignore_services: + continue + + # Sometimes process_name is in critical_processes file, but it is not in supervisor.conf, such process will not run in container. + # and it is safe to ignore such process. E.g, radv. So here we only check those processes which are in process_status. + if process_name in process_status: + if process_status[process_name] != 'RUNNING': + self.set_object_not_ok('Process', '{}:{}'.format(container_name, process_name), "'{}' is not running".format(process_name)) + else: + self.set_object_ok('Process', '{}:{}'.format(container_name, process_name)) diff --git a/src/system-health/health_checker/utils.py b/src/system-health/health_checker/utils.py index f310002e1e5f..14939a330539 100644 --- a/src/system-health/health_checker/utils.py +++ b/src/system-health/health_checker/utils.py @@ -9,7 +9,7 @@ def run_command(command): """ try: process = subprocess.Popen(command, shell=True, universal_newlines=True, stdout=subprocess.PIPE) - return process.communicate()[0] + return process.communicate()[0].strip() except Exception: return None diff --git a/src/system-health/setup.py b/src/system-health/setup.py index b0cc998e482f..62b02252e8c0 100644 --- a/src/system-health/setup.py +++ b/src/system-health/setup.py @@ -3,6 +3,7 @@ dependencies = [ 'natsort', 'sonic_py_common', + 'docker' ] setup( diff --git a/src/system-health/tests/etc/supervisor/critical_processes b/src/system-health/tests/etc/supervisor/critical_processes new file mode 100644 index 000000000000..cff479d7fa11 --- /dev/null +++ b/src/system-health/tests/etc/supervisor/critical_processes @@ -0,0 +1,2 @@ +program:snmpd +program:snmp-subagent diff --git a/src/system-health/tests/system_health_monitoring_config.json b/src/system-health/tests/system_health_monitoring_config.json new file mode 100644 index 000000000000..7284be7d163b --- /dev/null +++ b/src/system-health/tests/system_health_monitoring_config.json @@ -0,0 +1,11 @@ +{ + "services_to_ignore": ["dummy_service"], + "devices_to_ignore": ["psu.voltage"], + "user_defined_checkers": [], + "polling_interval": 60, + "led_color": { + "fault": "orange", + "normal": "green", + "booting": "orange_blink" + } +} diff --git a/src/system-health/tests/test_system_health.py b/src/system-health/tests/test_system_health.py index 8e3d3c77e9f1..101f47dc36bc 100644 --- a/src/system-health/tests/test_system_health.py +++ b/src/system-health/tests/test_system_health.py @@ -8,6 +8,7 @@ 2. HealthCheckerManager 3. Config """ +import copy import os import sys from swsscommon import swsscommon @@ -30,11 +31,21 @@ from health_checker.service_checker import ServiceChecker from health_checker.user_defined_checker import UserDefinedChecker +mock_supervisorctl_output = """ +snmpd RUNNING pid 67, uptime 1:03:56 +snmp-subagent EXITED Oct 19 01:53 AM +""" device_info.get_platform = MagicMock(return_value='unittest') -def test_user_defined_checker(): - utils.run_command = MagicMock(return_value='') +def setup(): + if os.path.exists(ServiceChecker.CRITICAL_PROCESS_CACHE): + os.remove(ServiceChecker.CRITICAL_PROCESS_CACHE) + + +@patch('health_checker.utils.run_command') +def test_user_defined_checker(mock_run): + mock_run.return_value = '' checker = UserDefinedChecker('') checker.check(None) @@ -43,29 +54,183 @@ def test_user_defined_checker(): checker.reset() assert len(checker._info) == 0 - utils.run_command = MagicMock(return_value='\n\n\n') + mock_run.return_value = '\n\n\n' checker.check(None) assert checker._info[str(checker)][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK valid_output = 'MyCategory\nDevice1:OK\nDevice2:Device2 is broken\n' - utils.run_command = MagicMock(return_value=valid_output) + mock_run.return_value = valid_output checker.check(None) + assert checker.get_category() == 'MyCategory' assert 'Device1' in checker._info assert 'Device2' in checker._info assert checker._info['Device1'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_OK assert checker._info['Device2'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK -def test_service_checker(): - return_value = '' +@patch('swsssdk.ConfigDBConnector.connect', MagicMock()) +@patch('health_checker.service_checker.ServiceChecker._get_container_folder', MagicMock(return_value=test_path)) +@patch('sonic_py_common.multi_asic.is_multi_asic', MagicMock(return_value=False)) +@patch('docker.DockerClient') +@patch('health_checker.utils.run_command') +@patch('swsssdk.ConfigDBConnector.get_table') +def test_service_checker_single_asic(mock_get_table, mock_run, mock_docker_client): + mock_get_table.return_value = { + 'snmp': { + 'state': 'enabled', + 'has_global_scope': 'True', + 'has_per_asic_scope': 'False', + + } + } + mock_containers = MagicMock() + mock_snmp_container = MagicMock() + mock_snmp_container.name = 'snmp' + mock_containers.list = MagicMock(return_value=[mock_snmp_container]) + mock_docker_client_object = MagicMock() + mock_docker_client.return_value = mock_docker_client_object + mock_docker_client_object.containers = mock_containers + + mock_run.return_value = mock_supervisorctl_output + + checker = ServiceChecker() + assert checker.get_category() == 'Services' + config = Config() + checker.check(config) + assert 'snmp:snmpd' in checker._info + assert checker._info['snmp:snmpd'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_OK + + assert 'snmp:snmp-subagent' in checker._info + assert checker._info['snmp:snmp-subagent'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK + + mock_get_table.return_value = { + 'new_service': { + 'state': 'enabled', + 'has_global_scope': 'True', + 'has_per_asic_scope': 'False', + }, + 'snmp': { + 'state': 'enabled', + 'has_global_scope': 'True', + 'has_per_asic_scope': 'False', + + } + } + mock_ns_container = MagicMock() + mock_ns_container.name = 'new_service' + mock_containers.list = MagicMock(return_value=[mock_snmp_container, mock_ns_container]) + checker.check(config) + assert 'new_service' in checker.critical_process_dict + + assert 'new_service:snmpd' in checker._info + assert checker._info['new_service:snmpd'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_OK + + assert 'new_service:snmp-subagent' in checker._info + assert checker._info['new_service:snmp-subagent'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK + + mock_containers.list = MagicMock(return_value=[mock_snmp_container]) + checker.check(config) + assert 'new_service' in checker._info + assert checker._info['new_service'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK + + mock_containers.list = MagicMock(return_value=[mock_snmp_container, mock_ns_container]) + mock_run.return_value = None + checker.check(config) + assert 'new_service:snmpd' in checker._info + assert checker._info['new_service:snmpd'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK + + assert 'new_service:snmp-subagent' in checker._info + assert checker._info['new_service:snmp-subagent'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK + + origin_critical_process_dict = copy.deepcopy(checker.critical_process_dict) + checker.save_critical_process_cache() + checker.load_critical_process_cache() + assert origin_critical_process_dict == checker.critical_process_dict + + +@patch('swsssdk.ConfigDBConnector.connect', MagicMock()) +@patch('health_checker.service_checker.ServiceChecker._get_container_folder', MagicMock(return_value=test_path)) +@patch('health_checker.utils.run_command', MagicMock(return_value=mock_supervisorctl_output)) +@patch('sonic_py_common.multi_asic.get_num_asics', MagicMock(return_value=3)) +@patch('sonic_py_common.multi_asic.is_multi_asic', MagicMock(return_value=True)) +@patch('sonic_py_common.multi_asic.get_namespace_list', MagicMock(return_value=[str(x) for x in range(3)])) +@patch('sonic_py_common.multi_asic.get_current_namespace', MagicMock(return_value='')) +@patch('docker.DockerClient') +@patch('swsssdk.ConfigDBConnector.get_table') +def test_service_checker_multi_asic(mock_get_table, mock_docker_client): + mock_get_table.return_value = { + 'snmp': { + 'state': 'enabled', + 'has_global_scope': 'True', + 'has_per_asic_scope': 'True', + + } + } + mock_containers = MagicMock() + mock_snmp_container = MagicMock() + mock_snmp_container.name = 'snmp' + list_return_value = [mock_snmp_container] + for i in range(3): + mock_container = MagicMock() + mock_container.name = 'snmp' + str(i) + list_return_value.append(mock_container) + + mock_containers.list = MagicMock(return_value=list_return_value) + mock_docker_client_object = MagicMock() + mock_docker_client.return_value = mock_docker_client_object + mock_docker_client_object.containers = mock_containers - def mock_run_command(cmd): - if cmd == ServiceChecker.CHECK_MONIT_SERVICE_CMD: - return 'active' - else: - return return_value + checker = ServiceChecker() - utils.run_command = mock_run_command + config = Config() + checker.check(config) + assert 'snmp' in checker.critical_process_dict + assert 'snmp:snmpd' in checker._info + assert checker._info['snmp:snmpd'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_OK + assert 'snmp0:snmpd' in checker._info + assert checker._info['snmp0:snmpd'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_OK + assert 'snmp1:snmpd' in checker._info + assert checker._info['snmp1:snmpd'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_OK + assert 'snmp2:snmpd' in checker._info + assert checker._info['snmp2:snmpd'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_OK + + assert 'snmp:snmp-subagent' in checker._info + assert checker._info['snmp:snmp-subagent'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK + assert 'snmp0:snmp-subagent' in checker._info + assert checker._info['snmp0:snmp-subagent'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK + assert 'snmp1:snmp-subagent' in checker._info + assert checker._info['snmp1:snmp-subagent'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK + assert 'snmp2:snmp-subagent' in checker._info + assert checker._info['snmp2:snmp-subagent'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK + +@patch('swsssdk.ConfigDBConnector.connect', MagicMock()) +@patch('health_checker.service_checker.ServiceChecker.check_by_monit', MagicMock()) +@patch('docker.DockerClient') +@patch('swsssdk.ConfigDBConnector.get_table') +def test_service_checker_no_critical_process(mock_get_table, mock_docker_client): + mock_get_table.return_value = { + 'snmp': { + 'state': 'enabled', + 'has_global_scope': 'True', + 'has_per_asic_scope': 'True', + + } + } + mock_containers = MagicMock() + mock_containers.list = MagicMock(return_value=[]) + mock_docker_client_object = MagicMock() + mock_docker_client.return_value = mock_docker_client_object + mock_docker_client_object.containers = mock_containers + + checker = ServiceChecker() + config = Config() + checker.check(config) + assert 'system' in checker._info + assert checker._info['system'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK + +@patch('health_checker.service_checker.ServiceChecker.check_services', MagicMock()) +@patch('health_checker.utils.run_command') +def test_service_checker_check_by_monit(mock_run): return_value = 'Monit 5.20.0 uptime: 3h 54m\n' \ 'Service Name Status Type\n' \ 'sonic Running System\n' \ @@ -74,7 +239,7 @@ def mock_run_command(cmd): 'orchagent Running Process\n' \ 'root-overlay Accessible Filesystem\n' \ 'var-log Is not accessible Filesystem\n' - + mock_run.side_effect = ['active', return_value] checker = ServiceChecker() config = Config() checker.check(config) @@ -185,6 +350,7 @@ def test_hardware_checker(): }) checker = HardwareChecker() + assert checker.get_category() == 'Hardware' config = Config() checker.check(config) @@ -217,3 +383,128 @@ def test_hardware_checker(): assert 'PSU 5' in checker._info assert checker._info['PSU 5'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK + + +def test_config(): + config = Config() + config._config_file = os.path.join(test_path, Config.CONFIG_FILE) + + assert config.config_file_exists() + config.load_config() + assert config.interval == 60 + assert 'dummy_service' in config.ignore_services + assert 'psu.voltage' in config.ignore_devices + assert len(config.user_defined_checkers) == 0 + + assert config.get_led_color('fault') == 'orange' + assert config.get_led_color('normal') == 'green' + assert config.get_led_color('booting') == 'orange_blink' + assert config.get_bootup_timeout() == 300 + + config._reset() + assert not config.ignore_services + assert not config.ignore_devices + assert not config.user_defined_checkers + assert not config.config_data + + assert config.get_led_color('fault') == 'red' + assert config.get_led_color('normal') == 'green' + assert config.get_led_color('booting') == 'orange_blink' + + config._last_mtime = 1 + config._config_file = 'notExistFile' + config.load_config() + assert not config._last_mtime + + +@patch('swsssdk.ConfigDBConnector.connect', MagicMock()) +@patch('health_checker.service_checker.ServiceChecker.check', MagicMock()) +@patch('health_checker.hardware_checker.HardwareChecker.check', MagicMock()) +@patch('health_checker.user_defined_checker.UserDefinedChecker.check', MagicMock()) +@patch('swsssdk.ConfigDBConnector.get_table', MagicMock()) +@patch('health_checker.user_defined_checker.UserDefinedChecker.get_category', MagicMock(return_value='UserDefine')) +@patch('health_checker.user_defined_checker.UserDefinedChecker.get_info') +@patch('health_checker.service_checker.ServiceChecker.get_info') +@patch('health_checker.hardware_checker.HardwareChecker.get_info') +@patch('health_checker.utils.get_uptime') +def test_manager(mock_uptime, mock_hw_info, mock_service_info, mock_udc_info): + chassis = MagicMock() + chassis.set_status_led = MagicMock() + + manager = HealthCheckerManager() + manager.config.user_defined_checkers = ['some check'] + assert manager._state == HealthCheckerManager.STATE_BOOTING + assert len(manager._checkers) == 2 + + mock_uptime.return_value = 200 + assert manager._is_system_booting() + state, stat = manager.check(chassis) + assert state == HealthCheckerManager.STATE_BOOTING + assert len(stat) == 0 + chassis.set_status_led.assert_called_with('orange_blink') + + mock_uptime.return_value = 500 + assert not manager._is_system_booting() + assert manager._state == HealthCheckerManager.STATE_RUNNING + mock_hw_info.return_value = { + 'ASIC': { + 'type': 'ASIC', + 'message': '', + 'status': 'OK' + }, + 'fan1': { + 'type': 'Fan', + 'message': '', + 'status': 'OK' + }, + } + mock_service_info.return_value = { + 'snmp:snmpd': { + 'type': 'Process', + 'message': '', + 'status': 'OK' + } + } + mock_udc_info.return_value = { + 'udc': { + 'type': 'Database', + 'message': '', + 'status': 'OK' + } + } + state, stat = manager.check(chassis) + assert state == HealthCheckerManager.STATE_RUNNING + assert 'Services' in stat + assert stat['Services']['snmp:snmpd']['status'] == 'OK' + + assert 'Hardware' in stat + assert stat['Hardware']['ASIC']['status'] == 'OK' + assert stat['Hardware']['fan1']['status'] == 'OK' + + assert 'UserDefine' in stat + assert stat['UserDefine']['udc']['status'] == 'OK' + + mock_hw_info.side_effect = RuntimeError() + mock_service_info.side_effect = RuntimeError() + mock_udc_info.side_effect = RuntimeError() + state, stat = manager.check(chassis) + assert 'Internal' in stat + assert stat['Internal']['ServiceChecker']['status'] == 'Not OK' + assert stat['Internal']['HardwareChecker']['status'] == 'Not OK' + assert stat['Internal']['UserDefinedChecker - some check']['status'] == 'Not OK' + + chassis.set_status_led.side_effect = NotImplementedError() + manager._set_system_led(chassis, manager.config, 'normal') + + chassis.set_status_led.side_effect = RuntimeError() + manager._set_system_led(chassis, manager.config, 'normal') + +def test_utils(): + output = utils.run_command('some invalid command') + assert not output + + output = utils.run_command('ls') + assert output + + uptime = utils.get_uptime() + assert uptime > 0 From 90a4bf8be1d1da0d9bdeed9c38f9c0e622cdfb6c Mon Sep 17 00:00:00 2001 From: junchao Date: Wed, 27 Oct 2021 10:42:34 +0800 Subject: [PATCH 2/9] Fix review comments --- .../health_checker/service_checker.py | 51 +++++++++---------- src/system-health/tests/test_system_health.py | 35 +++++++++---- 2 files changed, 48 insertions(+), 38 deletions(-) diff --git a/src/system-health/health_checker/service_checker.py b/src/system-health/health_checker/service_checker.py index 37d52bfb5236..c81103e8c496 100644 --- a/src/system-health/health_checker/service_checker.py +++ b/src/system-health/health_checker/service_checker.py @@ -3,8 +3,7 @@ import pickle import re - -import swsssdk +from swsscommon import swsscommon from sonic_py_common import multi_asic from sonic_py_common.logger import Logger from .health_checker import HealthChecker @@ -41,7 +40,7 @@ class ServiceChecker(HealthChecker): 'Filesystem': 'Accessible', 'Program': 'Status ok' } - + def __init__(self): HealthChecker.__init__(self) self.critical_process_dict = {} @@ -65,13 +64,13 @@ def get_expected_runnning_container_set(self, feature_table): """ containers = set() container_feature_dict = {} - for feature_name in feature_table.keys(): - if feature_table[feature_name]["state"] not in ["disabled", "always_disabled"]: + for feature_name, feature_entry in feature_table.items(): + if feature_entry["state"] not in ["disabled", "always_disabled"]: if multi_asic.is_multi_asic(): - if feature_table[feature_name]["has_global_scope"] == "True": + if feature_entry["has_global_scope"] == "True": containers.add(feature_name) container_feature_dict[feature_name] = feature_name - if feature_table[feature_name]["has_per_asic_scope"] == "True": + if feature_entry["has_per_asic_scope"] == "True": num_asics = multi_asic.get_num_asics() for asic_id in range(num_asics): containers.add(feature_name + str(asic_id)) @@ -98,10 +97,10 @@ def get_current_running_container_set(self): for ctr in lst: running_containers.add(ctr.name) if ctr.name not in self.critical_process_dict: - self.get_critical_process_by_container(ctr.name) + self.fill_critical_process_by_container(ctr.name) except docker.errors.APIError as err: logger.log_debug("Failed to retrieve the running container list. Error: '{}'".format(err)) - pass + return running_containers def get_critical_process_list_from_file(self, container, critical_processes_file): @@ -113,25 +112,22 @@ def get_critical_process_list_from_file(self, container, critical_processes_file with open(critical_processes_file, 'r') as file: for line in file: - # ignore blank lines - if re.match(r"^\s*$", line): - continue - line_info = line.strip(' \n').split(':') - if len(line_info) != 2: - # Invalid syntax in critical_processes file, save it as an error - self.bad_containers.add(container) - logger.log_error('Invalid syntax in critical_processes file of {}'.format(container)) + # Try to match a line like "program:" + match = re.match(r"^\s*((.+):(.*))*\s*$", line) + if match is None: + if container not in self.bad_containers: + self.bad_containers.add(container) + logger.log_error('Invalid syntax in critical_processes file of {}'.format(container)) continue - identifier_key = line_info[0].strip() - identifier_value = line_info[1].strip() + identifier_key = match.group(2) + identifier_value = match.group(3) if identifier_key == "program" and identifier_value: - # We only count lines like "program:" critical_process_list.append(identifier_value) return critical_process_list - def get_critical_process_by_container(self, container): + def fill_critical_process_by_container(self, container): """Get critical process for a given container Args: @@ -160,7 +156,6 @@ def get_critical_process_by_container(self, container): # Get critical process list from critical_processes critical_process_list = self.get_critical_process_list_from_file(container, critical_processes_file) self._update_critical_process_dict(container, critical_process_list) - return def _update_critical_process_dict(self, container, critical_process_list): self.critical_process_dict[container] = critical_process_list @@ -175,7 +170,7 @@ def save_critical_process_cache(self): if not self.need_save_cache: return - self.need_save_cache = True + self.need_save_cache = False if not self.critical_process_dict: # if critical_process_dict is empty, don't save it return @@ -245,7 +240,7 @@ def check_services(self, config): Args: config (config.Config): Health checker configuration. """ - config_db = swsssdk.ConfigDBConnector() + config_db = swsscommon.ConfigDBConnector() config_db.connect() feature_table = config_db.get_table("FEATURE") expected_running_containers, self.container_feature_dict = self.get_expected_runnning_container_set(feature_table) @@ -269,14 +264,14 @@ def check_services(self, config): def check(self, config): """ - Check critical system service status. + Check critical system service status. :param config: Health checker configuration. :return: """ self.reset() self.check_by_monit(config) self.check_services(config) - + def _parse_supervisorctl_status(self, process_status): """Expected input: @@ -292,6 +287,8 @@ def _parse_supervisorctl_status(self, process_status): if not line: continue items = line.split() + if len(items) < 2: + continue data[items[0].strip()] = items[1].strip() return data @@ -315,7 +312,7 @@ def check_process_existence(self, container_name, critical_process_list, config, for process_name in critical_process_list: self.set_object_not_ok('Process', '{}:{}'.format(container_name, process_name), "'{}' is not running".format(process_name)) return - + process_status = self._parse_supervisorctl_status(process_status.strip().splitlines()) for process_name in critical_process_list: if config and config.ignore_services and process_name in config.ignore_services: diff --git a/src/system-health/tests/test_system_health.py b/src/system-health/tests/test_system_health.py index 101f47dc36bc..e7485555f4cd 100644 --- a/src/system-health/tests/test_system_health.py +++ b/src/system-health/tests/test_system_health.py @@ -68,13 +68,17 @@ def test_user_defined_checker(mock_run): assert checker._info['Device2'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK -@patch('swsssdk.ConfigDBConnector.connect', MagicMock()) +@patch('swsscommon.swsscommon.ConfigDBConnector.connect', MagicMock()) @patch('health_checker.service_checker.ServiceChecker._get_container_folder', MagicMock(return_value=test_path)) @patch('sonic_py_common.multi_asic.is_multi_asic', MagicMock(return_value=False)) @patch('docker.DockerClient') @patch('health_checker.utils.run_command') -@patch('swsssdk.ConfigDBConnector.get_table') -def test_service_checker_single_asic(mock_get_table, mock_run, mock_docker_client): +@patch('swsscommon.swsscommon.ConfigDBConnector') +def test_service_checker_single_asic(mock_config_db, mock_run, mock_docker_client): + mock_db_data = MagicMock() + mock_get_table = MagicMock() + mock_db_data.get_table = mock_get_table + mock_config_db.return_value = mock_db_data mock_get_table.return_value = { 'snmp': { 'state': 'enabled', @@ -148,7 +152,8 @@ def test_service_checker_single_asic(mock_get_table, mock_run, mock_docker_clien assert origin_critical_process_dict == checker.critical_process_dict -@patch('swsssdk.ConfigDBConnector.connect', MagicMock()) + +@patch('swsscommon.swsscommon.ConfigDBConnector.connect', MagicMock()) @patch('health_checker.service_checker.ServiceChecker._get_container_folder', MagicMock(return_value=test_path)) @patch('health_checker.utils.run_command', MagicMock(return_value=mock_supervisorctl_output)) @patch('sonic_py_common.multi_asic.get_num_asics', MagicMock(return_value=3)) @@ -156,9 +161,13 @@ def test_service_checker_single_asic(mock_get_table, mock_run, mock_docker_clien @patch('sonic_py_common.multi_asic.get_namespace_list', MagicMock(return_value=[str(x) for x in range(3)])) @patch('sonic_py_common.multi_asic.get_current_namespace', MagicMock(return_value='')) @patch('docker.DockerClient') -@patch('swsssdk.ConfigDBConnector.get_table') -def test_service_checker_multi_asic(mock_get_table, mock_docker_client): - mock_get_table.return_value = { +@patch('swsscommon.swsscommon.ConfigDBConnector') +def test_service_checker_multi_asic(mock_config_db, mock_docker_client): + mock_db_data = MagicMock() + mock_db_data.get_table = MagicMock() + mock_config_db.return_value = mock_db_data + + mock_db_data.get_table.return_value = { 'snmp': { 'state': 'enabled', 'has_global_scope': 'True', @@ -166,6 +175,7 @@ def test_service_checker_multi_asic(mock_get_table, mock_docker_client): } } + mock_containers = MagicMock() mock_snmp_container = MagicMock() mock_snmp_container.name = 'snmp' @@ -203,10 +213,12 @@ def test_service_checker_multi_asic(mock_get_table, mock_docker_client): assert 'snmp2:snmp-subagent' in checker._info assert checker._info['snmp2:snmp-subagent'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK -@patch('swsssdk.ConfigDBConnector.connect', MagicMock()) + +@patch('swsscommon.swsscommon.ConfigDBConnector', MagicMock()) +@patch('swsscommon.swsscommon.ConfigDBConnector.connect', MagicMock()) @patch('health_checker.service_checker.ServiceChecker.check_by_monit', MagicMock()) @patch('docker.DockerClient') -@patch('swsssdk.ConfigDBConnector.get_table') +@patch('swsscommon.swsscommon.ConfigDBConnector.get_table') def test_service_checker_no_critical_process(mock_get_table, mock_docker_client): mock_get_table.return_value = { 'snmp': { @@ -417,11 +429,12 @@ def test_config(): assert not config._last_mtime -@patch('swsssdk.ConfigDBConnector.connect', MagicMock()) +@patch('swsscommon.swsscommon.ConfigDBConnector', MagicMock()) +@patch('swsscommon.swsscommon.ConfigDBConnector.connect', MagicMock()) @patch('health_checker.service_checker.ServiceChecker.check', MagicMock()) @patch('health_checker.hardware_checker.HardwareChecker.check', MagicMock()) @patch('health_checker.user_defined_checker.UserDefinedChecker.check', MagicMock()) -@patch('swsssdk.ConfigDBConnector.get_table', MagicMock()) +@patch('swsscommon.swsscommon.ConfigDBConnector.get_table', MagicMock()) @patch('health_checker.user_defined_checker.UserDefinedChecker.get_category', MagicMock(return_value='UserDefine')) @patch('health_checker.user_defined_checker.UserDefinedChecker.get_info') @patch('health_checker.service_checker.ServiceChecker.get_info') From 68cc6cfc4edf47c29b5c1a75fbf298046120cf6c Mon Sep 17 00:00:00 2001 From: junchao Date: Fri, 29 Oct 2021 09:35:53 +0800 Subject: [PATCH 3/9] Fix review comment --- src/system-health/health_checker/service_checker.py | 8 ++++++-- src/system-health/health_checker/utils.py | 2 +- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/system-health/health_checker/service_checker.py b/src/system-health/health_checker/service_checker.py index c81103e8c496..f3c832bac2b2 100644 --- a/src/system-health/health_checker/service_checker.py +++ b/src/system-health/health_checker/service_checker.py @@ -162,7 +162,11 @@ def _update_critical_process_dict(self, container, critical_process_list): self.need_save_cache = True def _get_container_folder(self, container): - return utils.run_command(ServiceChecker.GET_CONTAINER_FOLDER_CMD.format(container)) + container_folder = utils.run_command(ServiceChecker.GET_CONTAINER_FOLDER_CMD.format(container)) + if container_folder is None: + return container_folder + + return container_folder.strip() def save_critical_process_cache(self): """Save self.critical_process_dict to a cache file @@ -203,7 +207,7 @@ def check_by_monit(self, config): :return: """ output = utils.run_command(ServiceChecker.CHECK_MONIT_SERVICE_CMD) - if not output or output != 'active': + if output != 'active': self.set_object_not_ok('Service', 'monit', 'monit service is not running') return diff --git a/src/system-health/health_checker/utils.py b/src/system-health/health_checker/utils.py index 14939a330539..f310002e1e5f 100644 --- a/src/system-health/health_checker/utils.py +++ b/src/system-health/health_checker/utils.py @@ -9,7 +9,7 @@ def run_command(command): """ try: process = subprocess.Popen(command, shell=True, universal_newlines=True, stdout=subprocess.PIPE) - return process.communicate()[0].strip() + return process.communicate()[0] except Exception: return None From ac79a808a1b4d135fef14d047b18feb9003b8042 Mon Sep 17 00:00:00 2001 From: junchao Date: Fri, 5 Nov 2021 18:23:28 +0800 Subject: [PATCH 4/9] Fix issue found in test --- src/system-health/health_checker/service_checker.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/system-health/health_checker/service_checker.py b/src/system-health/health_checker/service_checker.py index f3c832bac2b2..223c227185f7 100644 --- a/src/system-health/health_checker/service_checker.py +++ b/src/system-health/health_checker/service_checker.py @@ -207,7 +207,7 @@ def check_by_monit(self, config): :return: """ output = utils.run_command(ServiceChecker.CHECK_MONIT_SERVICE_CMD) - if output != 'active': + if not output or output.strip() != 'active': self.set_object_not_ok('Service', 'monit', 'monit service is not running') return @@ -249,6 +249,11 @@ def check_services(self, config): feature_table = config_db.get_table("FEATURE") expected_running_containers, self.container_feature_dict = self.get_expected_runnning_container_set(feature_table) current_running_containers = self.get_current_running_container_set() + + newly_disabled_containers = set(self.critical_process_dict.keys()).difference(expected_running_containers) + for newly_disabled_container in newly_disabled_containers: + self.critical_process_dict.pop(newly_disabled_container) + self.save_critical_process_cache() not_running_containers = expected_running_containers.difference(current_running_containers) From 7147bfdba3410bc75f915dbc91df1cd9a70dff88 Mon Sep 17 00:00:00 2001 From: junchao Date: Fri, 5 Nov 2021 18:29:38 +0800 Subject: [PATCH 5/9] Set type to internal when got exception while check --- src/system-health/health_checker/manager.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/system-health/health_checker/manager.py b/src/system-health/health_checker/manager.py index 0b486bd1c94c..6e07ed3b329d 100644 --- a/src/system-health/health_checker/manager.py +++ b/src/system-health/health_checker/manager.py @@ -77,7 +77,8 @@ def _do_check(self, checker, stats): error_msg = 'Failed to perform health check for {} due to exception - {}'.format(checker, repr(e)) entry = {str(checker): { HealthChecker.INFO_FIELD_OBJECT_STATUS: HealthChecker.STATUS_NOT_OK, - HealthChecker.INFO_FIELD_OBJECT_MSG: error_msg + HealthChecker.INFO_FIELD_OBJECT_MSG: error_msg, + HealthChecker.INFO_FIELD_OBJECT_TYPE: "Internal" }} if 'Internal' not in stats: stats['Internal'] = entry From e7f1d15f27c3dd97f4fe618a16ec822626ec779f Mon Sep 17 00:00:00 2001 From: junchao Date: Fri, 5 Nov 2021 19:34:33 +0800 Subject: [PATCH 6/9] Set summary to not ok if get internal error --- src/system-health/health_checker/manager.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/system-health/health_checker/manager.py b/src/system-health/health_checker/manager.py index 6e07ed3b329d..f6bc31f874e5 100644 --- a/src/system-health/health_checker/manager.py +++ b/src/system-health/health_checker/manager.py @@ -74,6 +74,7 @@ def _do_check(self, checker, stats): else: stats[category].update(info) except Exception as e: + HealthChecker.summary = HealthChecker.STATUS_NOT_OK error_msg = 'Failed to perform health check for {} due to exception - {}'.format(checker, repr(e)) entry = {str(checker): { HealthChecker.INFO_FIELD_OBJECT_STATUS: HealthChecker.STATUS_NOT_OK, From fd76ef7be00a8eef073449b1725d820d7088074e Mon Sep 17 00:00:00 2001 From: junchao Date: Thu, 11 Nov 2021 14:31:01 +0800 Subject: [PATCH 7/9] Fix review comment --- .../health_checker/service_checker.py | 68 ++++++++++--------- src/system-health/tests/test_system_health.py | 8 +-- 2 files changed, 41 insertions(+), 35 deletions(-) diff --git a/src/system-health/health_checker/service_checker.py b/src/system-health/health_checker/service_checker.py index 223c227185f7..7da32bc64c0c 100644 --- a/src/system-health/health_checker/service_checker.py +++ b/src/system-health/health_checker/service_checker.py @@ -18,7 +18,7 @@ class ServiceChecker(HealthChecker): Checker that checks critical system service status via monit service. """ - # Cache file to save critical_process_dict + # Cache file to save container_critical_processes CRITICAL_PROCESS_CACHE = '/tmp/critical_process_cache' CRITICAL_PROCESSES_PATH = 'etc/supervisor/critical_processes' @@ -43,7 +43,7 @@ class ServiceChecker(HealthChecker): def __init__(self): HealthChecker.__init__(self) - self.critical_process_dict = {} + self.container_critical_processes = {} # Containers that has invalid critical_processes file self.bad_containers = set() @@ -53,40 +53,41 @@ def __init__(self): self.load_critical_process_cache() - def get_expected_runnning_container_set(self, feature_table): + def get_expected_running_containers(self, feature_table): """Get a set of containers that are expected to running on SONiC Args: feature_table (object): FEATURE table in CONFIG_DB Returns: - set: A set of container names + expected_running_containers: A set of container names that are expected running + container_feature_dict: A dictionary {:} """ - containers = set() + expected_running_containers = set() container_feature_dict = {} for feature_name, feature_entry in feature_table.items(): if feature_entry["state"] not in ["disabled", "always_disabled"]: if multi_asic.is_multi_asic(): if feature_entry["has_global_scope"] == "True": - containers.add(feature_name) + expected_running_containers.add(feature_name) container_feature_dict[feature_name] = feature_name if feature_entry["has_per_asic_scope"] == "True": num_asics = multi_asic.get_num_asics() for asic_id in range(num_asics): - containers.add(feature_name + str(asic_id)) + expected_running_containers.add(feature_name + str(asic_id)) container_feature_dict[feature_name + str(asic_id)] = feature_name else: - containers.add(feature_name) + expected_running_containers.add(feature_name) container_feature_dict[feature_name] = feature_name - return containers, container_feature_dict + return expected_running_containers, container_feature_dict - def get_current_running_container_set(self): - """Get current running containers, if the running container is not in self.critical_process_dict, + def get_current_running_containers(self): + """Get current running containers, if the running container is not in self.container_critical_processes, try get the critical process list Returns: - set: A set of running containers + running_containers: A set of running container names """ DOCKER_CLIENT = docker.DockerClient(base_url='unix://var/run/docker.sock') running_containers = set() @@ -96,7 +97,7 @@ def get_current_running_container_set(self): for ctr in lst: running_containers.add(ctr.name) - if ctr.name not in self.critical_process_dict: + if ctr.name not in self.container_critical_processes: self.fill_critical_process_by_container(ctr.name) except docker.errors.APIError as err: logger.log_debug("Failed to retrieve the running container list. Error: '{}'".format(err)) @@ -104,9 +105,14 @@ def get_current_running_container_set(self): return running_containers def get_critical_process_list_from_file(self, container, critical_processes_file): - """ - @summary: Read the critical processes from CRITICAL_PROCESSES_FILE. - @return: A list which contain critical processes. + """Read critical process name list from critical processes file + + Args: + container (str): contianer name + critical_processes_file (str): critical processes file path + + Returns: + critical_process_list: A list of critical process names """ critical_process_list = [] @@ -150,15 +156,15 @@ def fill_critical_process_by_container(self, container): # Critical process file does not exist, the container has no critical processes. # This is fine, don't retry. logger.log_debug('Failed to get critical process file for {}, {} does not exist'.format(container, critical_processes_file)) - self._update_critical_process_dict(container, []) + self._update_container_critical_processes(container, []) return # Get critical process list from critical_processes critical_process_list = self.get_critical_process_list_from_file(container, critical_processes_file) - self._update_critical_process_dict(container, critical_process_list) + self._update_container_critical_processes(container, critical_process_list) - def _update_critical_process_dict(self, container, critical_process_list): - self.critical_process_dict[container] = critical_process_list + def _update_container_critical_processes(self, container, critical_process_list): + self.container_critical_processes[container] = critical_process_list self.need_save_cache = True def _get_container_folder(self, container): @@ -169,14 +175,14 @@ def _get_container_folder(self, container): return container_folder.strip() def save_critical_process_cache(self): - """Save self.critical_process_dict to a cache file + """Save self.container_critical_processes to a cache file """ if not self.need_save_cache: return self.need_save_cache = False - if not self.critical_process_dict: - # if critical_process_dict is empty, don't save it + if not self.container_critical_processes: + # if container_critical_processes is empty, don't save it return if os.path.exists(ServiceChecker.CRITICAL_PROCESS_CACHE): @@ -184,7 +190,7 @@ def save_critical_process_cache(self): os.remove(ServiceChecker.CRITICAL_PROCESS_CACHE) with open(ServiceChecker.CRITICAL_PROCESS_CACHE, 'wb+') as f: - pickle.dump(self.critical_process_dict, f) + pickle.dump(self.container_critical_processes, f) def load_critical_process_cache(self): if not os.path.isfile(ServiceChecker.CRITICAL_PROCESS_CACHE): @@ -192,7 +198,7 @@ def load_critical_process_cache(self): return with open(ServiceChecker.CRITICAL_PROCESS_CACHE, 'rb') as f: - self.critical_process_dict = pickle.load(f) + self.container_critical_processes = pickle.load(f) def reset(self): self._info = {} @@ -247,12 +253,12 @@ def check_services(self, config): config_db = swsscommon.ConfigDBConnector() config_db.connect() feature_table = config_db.get_table("FEATURE") - expected_running_containers, self.container_feature_dict = self.get_expected_runnning_container_set(feature_table) - current_running_containers = self.get_current_running_container_set() + expected_running_containers, self.container_feature_dict = self.get_expected_running_containers(feature_table) + current_running_containers = self.get_current_running_containers() - newly_disabled_containers = set(self.critical_process_dict.keys()).difference(expected_running_containers) + newly_disabled_containers = set(self.container_critical_processes.keys()).difference(expected_running_containers) for newly_disabled_container in newly_disabled_containers: - self.critical_process_dict.pop(newly_disabled_container) + self.container_critical_processes.pop(newly_disabled_container) self.save_critical_process_cache() @@ -260,12 +266,12 @@ def check_services(self, config): for container in not_running_containers: self.set_object_not_ok('Service', container, "Container '{}' is not running".format(container)) - if not self.critical_process_dict: + if not self.container_critical_processes: # Critical process is empty, not expect self.set_object_not_ok('Service', 'system', 'no critical process found') return - for container, critical_process_list in self.critical_process_dict.items(): + for container, critical_process_list in self.container_critical_processes.items(): self.check_process_existence(container, critical_process_list, config, feature_table) for bad_container in self.bad_containers: diff --git a/src/system-health/tests/test_system_health.py b/src/system-health/tests/test_system_health.py index e7485555f4cd..c7ff86c28b97 100644 --- a/src/system-health/tests/test_system_health.py +++ b/src/system-health/tests/test_system_health.py @@ -124,7 +124,7 @@ def test_service_checker_single_asic(mock_config_db, mock_run, mock_docker_clien mock_ns_container.name = 'new_service' mock_containers.list = MagicMock(return_value=[mock_snmp_container, mock_ns_container]) checker.check(config) - assert 'new_service' in checker.critical_process_dict + assert 'new_service' in checker.container_critical_processes assert 'new_service:snmpd' in checker._info assert checker._info['new_service:snmpd'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_OK @@ -146,10 +146,10 @@ def test_service_checker_single_asic(mock_config_db, mock_run, mock_docker_clien assert 'new_service:snmp-subagent' in checker._info assert checker._info['new_service:snmp-subagent'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK - origin_critical_process_dict = copy.deepcopy(checker.critical_process_dict) + origin_container_critical_processes = copy.deepcopy(checker.container_critical_processes) checker.save_critical_process_cache() checker.load_critical_process_cache() - assert origin_critical_process_dict == checker.critical_process_dict + assert origin_container_critical_processes == checker.container_critical_processes @@ -194,7 +194,7 @@ def test_service_checker_multi_asic(mock_config_db, mock_docker_client): config = Config() checker.check(config) - assert 'snmp' in checker.critical_process_dict + assert 'snmp' in checker.container_critical_processes assert 'snmp:snmpd' in checker._info assert checker._info['snmp:snmpd'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_OK assert 'snmp0:snmpd' in checker._info From 0f8f348f56e87dad127a6e426fd91479720ba5bb Mon Sep 17 00:00:00 2001 From: Junchao-Mellanox <57339448+Junchao-Mellanox@users.noreply.github.com> Date: Fri, 12 Nov 2021 08:52:24 +0800 Subject: [PATCH 8/9] Fix review commnet --- src/system-health/health_checker/service_checker.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/system-health/health_checker/service_checker.py b/src/system-health/health_checker/service_checker.py index 7da32bc64c0c..82119502ef0a 100644 --- a/src/system-health/health_checker/service_checker.py +++ b/src/system-health/health_checker/service_checker.py @@ -126,8 +126,8 @@ def get_critical_process_list_from_file(self, container, critical_processes_file logger.log_error('Invalid syntax in critical_processes file of {}'.format(container)) continue - identifier_key = match.group(2) - identifier_value = match.group(3) + identifier_key = match.group(2).strip() + identifier_value = match.group(3).strip() if identifier_key == "program" and identifier_value: critical_process_list.append(identifier_value) From e7998430275aca29e8bb388f3a5865d2f3119c37 Mon Sep 17 00:00:00 2001 From: junchao Date: Wed, 17 Nov 2021 13:10:58 +0800 Subject: [PATCH 9/9] Fix review comment --- .../health_checker/service_checker.py | 25 +++++++++++-------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/src/system-health/health_checker/service_checker.py b/src/system-health/health_checker/service_checker.py index 82119502ef0a..171f7e2e8b4d 100644 --- a/src/system-health/health_checker/service_checker.py +++ b/src/system-health/health_checker/service_checker.py @@ -100,7 +100,7 @@ def get_current_running_containers(self): if ctr.name not in self.container_critical_processes: self.fill_critical_process_by_container(ctr.name) except docker.errors.APIError as err: - logger.log_debug("Failed to retrieve the running container list. Error: '{}'".format(err)) + logger.log_error("Failed to retrieve the running container list. Error: '{}'".format(err)) return running_containers @@ -142,19 +142,17 @@ def fill_critical_process_by_container(self, container): # Get container volumn folder container_folder = self._get_container_folder(container) if not container_folder: - logger.log_debug('Failed to get container folder for {}'.format(container_folder)) + logger.log_error('Failed to get container folder for {}'.format(container_folder)) return - # If container folder does not exist, the container is probably not up, retry it if not os.path.exists(container_folder): - logger.log_debug('Container folder does not exist: {}'.format(container_folder)) + logger.log_error('Container folder does not exist: {}'.format(container_folder)) return # Get critical_processes file path critical_processes_file = os.path.join(container_folder, ServiceChecker.CRITICAL_PROCESSES_PATH) if not os.path.isfile(critical_processes_file): # Critical process file does not exist, the container has no critical processes. - # This is fine, don't retry. logger.log_debug('Failed to get critical process file for {}, {} does not exist'.format(container, critical_processes_file)) self._update_container_critical_processes(container, []) return @@ -278,10 +276,10 @@ def check_services(self, config): self.set_object_not_ok('Service', bad_container, 'Syntax of critical_processes file is incorrect') def check(self, config): - """ - Check critical system service status. - :param config: Health checker configuration. - :return: + """Check critical system service status. + + Args: + config (object): Health checker configuration. """ self.reset() self.check_by_monit(config) @@ -308,8 +306,13 @@ def _parse_supervisorctl_status(self, process_status): return data def check_process_existence(self, container_name, critical_process_list, config, feature_table): - """ - @summary: Check whether the process in the specified container is running or not. + """Check whether the process in the specified container is running or not. + + Args: + container_name (str): Container name + critical_process_list (list): Critical processes + config (object): Health checker configuration. + feature_table (object): Feature table """ feature_name = self.container_feature_dict[container_name] if feature_name in feature_table: