Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[system-health] No longer check critical process/service status via monit #9068

Merged
merged 9 commits into from
Nov 23, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions rules/system-health.mk
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ SYSTEM_HEALTH = system_health-1.0-py3-none-any.whl
$(SYSTEM_HEALTH)_SRC_PATH = $(SRC_PATH)/system-health
$(SYSTEM_HEALTH)_PYTHON_VERSION = 3
$(SYSTEM_HEALTH)_DEPENDS = $(SONIC_PY_COMMON_PY3) $(SONIC_CONFIG_ENGINE_PY3)
$(SYSTEM_HEALTH)_DEBS_DEPENDS = $(LIBSWSSCOMMON) $(PYTHON3_SWSSCOMMON)
SONIC_PYTHON_WHEELS += $(SYSTEM_HEALTH)

export system_health_py3_wheel_path="$(addprefix $(PYTHON_WHEELS_PATH)/,$(SYSTEM_HEALTH))"
21 changes: 12 additions & 9 deletions src/system-health/health_checker/manager.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,11 @@
from . import utils
from .config import Config
from .health_checker import HealthChecker
from .service_checker import ServiceChecker
from .hardware_checker import HardwareChecker
from .user_defined_checker import UserDefinedChecker


class HealthCheckerManager(object):
"""
Manage all system health checkers and system health configuration.
Expand All @@ -10,7 +18,6 @@ def __init__(self):
self._checkers = []
self._state = self.STATE_BOOTING

from .config import Config
self.config = Config()
self.initialize()

Expand All @@ -19,8 +26,6 @@ def initialize(self):
Initialize the manager. Create service checker and hardware checker by default.
:return:
"""
from .service_checker import ServiceChecker
from .hardware_checker import HardwareChecker
self._checkers.append(ServiceChecker())
self._checkers.append(HardwareChecker())

Expand All @@ -31,7 +36,6 @@ def check(self, chassis):
:return: A tuple. The first element indicates the status of the checker; the second element is a dictionary that
contains the status for all objects that were checked.
"""
from .health_checker import HealthChecker
HealthChecker.summary = HealthChecker.STATUS_OK
stats = {}
self.config.load_config()
Expand All @@ -45,7 +49,6 @@ def check(self, chassis):
self._do_check(checker, stats)

if self.config.user_defined_checkers:
from .user_defined_checker import UserDefinedChecker
for udc in self.config.user_defined_checkers:
checker = UserDefinedChecker(udc)
self._do_check(checker, stats)
Expand All @@ -71,20 +74,20 @@ def _do_check(self, checker, stats):
else:
stats[category].update(info)
except Exception as e:
from .health_checker import HealthChecker
HealthChecker.summary = HealthChecker.STATUS_NOT_OK
error_msg = 'Failed to perform health check for {} due to exception - {}'.format(checker, repr(e))
entry = {str(checker): {
HealthChecker.INFO_FIELD_OBJECT_STATUS: HealthChecker.STATUS_NOT_OK,
HealthChecker.INFO_FIELD_OBJECT_MSG: error_msg
HealthChecker.INFO_FIELD_OBJECT_MSG: error_msg,
HealthChecker.INFO_FIELD_OBJECT_TYPE: "Internal"
}}
if 'Internal' not in stats:
stats['Internal'] = entry
else:
stats['Internal'].update(entry)

def _is_system_booting(self):
from .utils import get_uptime
uptime = get_uptime()
uptime = utils.get_uptime()
if not self.boot_timeout:
self.boot_timeout = self.config.get_bootup_timeout()
booting = uptime < self.boot_timeout
Expand Down
Loading