diff --git a/.env-wb-garbage-collector b/.env-wb-garbage-collector index 7db538834cd..2845aab3123 100644 --- a/.env-wb-garbage-collector +++ b/.env-wb-garbage-collector @@ -20,7 +20,7 @@ WEBSERVER_LOGIN=null #WEBSERVER_RESOURCE_MANAGER from .env WEBSERVER_SCICRUNCH=null WEBSERVER_STATICWEB=null -# WEBSERVER_STORAGE=null +# WEBSERVER_STORAGE needed to delete data when removing anonymous service WEBSERVER_STUDIES_DISPATCHER=null WEBSERVER_TRACING=null # -------- diff --git a/api/specs/webserver/openapi-diagnostics.yaml b/api/specs/webserver/openapi-diagnostics.yaml index 3019b617916..6dc5a043d88 100644 --- a/api/specs/webserver/openapi-diagnostics.yaml +++ b/api/specs/webserver/openapi-diagnostics.yaml @@ -3,8 +3,8 @@ paths: get: tags: - maintenance - summary: run check - operationId: check_running + summary: readiness probe for + operationId: healthcheck_readiness_probe responses: "200": description: Service information @@ -18,8 +18,8 @@ paths: get: tags: - maintenance - summary: health check - operationId: check_health + summary: liveliness probe + operationId: healthcheck_liveness_probe responses: "200": description: Service information diff --git a/services/web/Dockerfile b/services/web/Dockerfile index be483974542..4f7b411d7bb 100644 --- a/services/web/Dockerfile +++ b/services/web/Dockerfile @@ -121,8 +121,6 @@ ARG VCS_REF ENV SC_BUILD_TARGET=production \ SC_BOOT_MODE=production \ - SC_HEALTHCHECK_INTERVAL=30 \ - SC_HEALTHCHECK_RETRY=3 \ SC_BUILD_DATE=${BUILD_DATE} \ SC_VCS_URL=${VCS_URL} \ SC_VCS_REF=${VCS_REF} @@ -145,6 +143,9 @@ RUN apt-get update \ COPY --chown=scu:scu services/web/server/docker services/web/server/docker RUN chmod +x services/web/server/docker/*.sh + +# healthcheck. NOTE: do not forget to update variable +ENV SC_HEALTHCHECK_TIMEOUT=120s HEALTHCHECK --interval=30s \ --timeout=120s \ --start-period=30s \ diff --git a/services/web/server/src/simcore_service_webserver/api/v0/openapi.yaml b/services/web/server/src/simcore_service_webserver/api/v0/openapi.yaml index 29287b7a1d3..556803ab1cb 100644 --- a/services/web/server/src/simcore_service_webserver/api/v0/openapi.yaml +++ b/services/web/server/src/simcore_service_webserver/api/v0/openapi.yaml @@ -55,8 +55,8 @@ paths: get: tags: - maintenance - summary: run check - operationId: check_running + summary: readiness probe for + operationId: healthcheck_readiness_probe responses: '200': description: Service information @@ -92,8 +92,8 @@ paths: get: tags: - maintenance - summary: health check - operationId: check_health + summary: liveliness probe + operationId: healthcheck_liveness_probe responses: '200': description: Service information diff --git a/services/web/server/src/simcore_service_webserver/application_settings.py b/services/web/server/src/simcore_service_webserver/application_settings.py index 0dc5a9b64ed..7308ed8c15e 100644 --- a/services/web/server/src/simcore_service_webserver/application_settings.py +++ b/services/web/server/src/simcore_service_webserver/application_settings.py @@ -12,6 +12,7 @@ ) from pydantic import validator from pydantic.fields import Field, ModelField +from pydantic.types import PositiveInt from settings_library.base import BaseCustomSettings from settings_library.email import SMTPSettings from settings_library.postgres import PostgresSettings @@ -58,8 +59,12 @@ class ApplicationSettings(BaseCustomSettings, MixinLoggingSettings): # @Dockerfile SC_BOOT_MODE: Optional[BootModeEnum] = None - SC_HEALTHCHECK_INTERVAL: Optional[int] = None - SC_HEALTHCHECK_RETRY: Optional[int] = None + SC_HEALTHCHECK_TIMEOUT: Optional[PositiveInt] = Field( + None, + description="If a single run of the check takes longer than timeout seconds " + "then the check is considered to have failed." + "It takes retries consecutive failures of the health check for the container to be considered unhealthy.", + ) SC_USER_ID: Optional[int] = None SC_USER_NAME: Optional[str] = None @@ -226,6 +231,19 @@ def log_level(self) -> int: def valid_log_level(cls, value) -> str: return cls.validate_log_level(value) + @validator("SC_HEALTHCHECK_TIMEOUT", pre=True) + @classmethod + def get_healthcheck_timeout_in_seconds(cls, v): + # Ex. HEALTHCHECK --interval=5m --timeout=3s + if isinstance(v, str): + if v.endswith("s"): + factor = 1 + elif v.endswith("m"): + factor = 60 + v = v.rstrip("ms") + return int(v) * factor + return v + # HELPERS -------------------------------------------------------- def is_enabled(self, field_name: str) -> bool: diff --git a/services/web/server/src/simcore_service_webserver/diagnostics.py b/services/web/server/src/simcore_service_webserver/diagnostics.py index 4def0033149..a9b08fff892 100644 --- a/services/web/server/src/simcore_service_webserver/diagnostics.py +++ b/services/web/server/src/simcore_service_webserver/diagnostics.py @@ -7,9 +7,15 @@ from servicelib.aiohttp.application_setup import ModuleCategory, app_module_setup from . import diagnostics_handlers -from .diagnostics_core import IncidentsRegistry, kINCIDENTS_REGISTRY, kPLUGIN_START_TIME +from .diagnostics_healthcheck import ( + IncidentsRegistry, + assert_healthy_app, + kINCIDENTS_REGISTRY, + kPLUGIN_START_TIME, +) from .diagnostics_monitoring import setup_monitoring from .diagnostics_settings import DiagnosticsSettings, get_plugin_settings +from .rest import HealthCheck log = logging.getLogger(__name__) @@ -37,6 +43,14 @@ def setup_diagnostics( # adds middleware and /metrics setup_monitoring(app) + # injects healthcheck + healthcheck: HealthCheck = app[HealthCheck.__name__] + + async def _on_healthcheck_async_adapter(app: web.Application): + assert_healthy_app(app) + + healthcheck.on_healthcheck.append(_on_healthcheck_async_adapter) + # adds other diagnostic routes: healthcheck, etc app.router.add_routes(diagnostics_handlers.routes) diff --git a/services/web/server/src/simcore_service_webserver/diagnostics_handlers.py b/services/web/server/src/simcore_service_webserver/diagnostics_handlers.py index d7f5508afe8..16c5750c089 100644 --- a/services/web/server/src/simcore_service_webserver/diagnostics_handlers.py +++ b/services/web/server/src/simcore_service_webserver/diagnostics_handlers.py @@ -12,8 +12,7 @@ from servicelib.utils import logged_gather from . import catalog_client, db, director_v2_api, storage_api -from ._meta import API_VERSION, APP_NAME, __version__, api_version_prefix -from .diagnostics_core import HealthError, assert_healthy_app +from ._meta import API_VERSION, APP_NAME, api_version_prefix from .login.decorators import login_required from .security_decorators import permission_required from .utils import get_task_info, get_tracemalloc_info @@ -23,23 +22,6 @@ routes = web.RouteTableDef() -@routes.get(f"/{api_version_prefix}/health", name="check_health") -async def get_app_health(request: web.Request): - # diagnostics of incidents - try: - assert_healthy_app(request.app) - except HealthError as err: - log.error("Unhealthy application: %s", err) - raise web.HTTPServiceUnavailable() - - data = { - "name": APP_NAME, - "version": __version__, - "api_version": API_VERSION, - } - return data - - @routes.get(f"/{api_version_prefix}/status/diagnostics", name="get_app_diagnostics") @login_required @permission_required("diagnostics.read") diff --git a/services/web/server/src/simcore_service_webserver/diagnostics_core.py b/services/web/server/src/simcore_service_webserver/diagnostics_healthcheck.py similarity index 91% rename from services/web/server/src/simcore_service_webserver/diagnostics_core.py rename to services/web/server/src/simcore_service_webserver/diagnostics_healthcheck.py index d49f1596595..38ac9a0a353 100644 --- a/services/web/server/src/simcore_service_webserver/diagnostics_core.py +++ b/services/web/server/src/simcore_service_webserver/diagnostics_healthcheck.py @@ -8,6 +8,7 @@ from servicelib.aiohttp.incidents import LimitedOrderedStack, SlowCallback from .diagnostics_settings import get_plugin_settings +from .rest_healthcheck import HealthCheckFailed log = logging.getLogger(__name__) @@ -23,10 +24,6 @@ kSTART_SENSING_DELAY_SECS = f"{__name__}.start_sensing_delay" -class HealthError(Exception): - """Service is set as unhealty""" - - class IncidentsRegistry(LimitedOrderedStack[SlowCallback]): def max_delay(self) -> float: return self.max_item.delay_secs if self else 0 @@ -80,11 +77,11 @@ def is_sensing_enabled(app: web.Application): def assert_healthy_app(app: web.Application) -> None: - """Diagnostics function that determins whether - current application is healthy based on incidents - occured up to now + """Diagnostics function that determines whether the current + application is healthy based on incidents probed since it was started + until now. - raises DiagnosticError if any incient detected + raises HealthCheckFailed if any incient detected """ settings = get_plugin_settings(app) @@ -111,7 +108,7 @@ def assert_healthy_app(app: web.Application) -> None: max_delay, max_delay_allowed, ) - raise HealthError(msg) + raise HealthCheckFailed(msg) # CRITERIA 2: Mean latency of the last N request slower than 1 sec probe: Optional[DelayWindowProbe] = app.get(kLATENCY_PROBE) @@ -126,6 +123,6 @@ def assert_healthy_app(app: web.Application) -> None: ) if max_latency_allowed < latency: - raise HealthError( + raise HealthCheckFailed( f"Last requests average latency is {latency} secs and surpasses {max_latency_allowed} secs" ) diff --git a/services/web/server/src/simcore_service_webserver/diagnostics_monitoring.py b/services/web/server/src/simcore_service_webserver/diagnostics_monitoring.py index 7ce44e17e89..434fcf8f01a 100644 --- a/services/web/server/src/simcore_service_webserver/diagnostics_monitoring.py +++ b/services/web/server/src/simcore_service_webserver/diagnostics_monitoring.py @@ -10,7 +10,11 @@ from servicelib.aiohttp.monitoring import setup_monitoring as service_lib_setup from . import _meta -from .diagnostics_core import DelayWindowProbe, is_sensing_enabled, kLATENCY_PROBE +from .diagnostics_healthcheck import ( + DelayWindowProbe, + is_sensing_enabled, + kLATENCY_PROBE, +) log = logging.getLogger(__name__) diff --git a/services/web/server/src/simcore_service_webserver/rest.py b/services/web/server/src/simcore_service_webserver/rest.py index c4321da07be..4a350fdc2a8 100644 --- a/services/web/server/src/simcore_service_webserver/rest.py +++ b/services/web/server/src/simcore_service_webserver/rest.py @@ -20,6 +20,7 @@ from . import rest_handlers from ._constants import APP_OPENAPI_SPECS_KEY, APP_SETTINGS_KEY from ._meta import API_VTAG, api_version_prefix +from .rest_healthcheck import HealthCheck from .rest_settings import RestSettings, get_plugin_settings from .rest_utils import get_openapi_specs_path, load_openapi_specs @@ -58,20 +59,11 @@ def setup_rest(app: web.Application): f"__version__.api_version_prefix {api_version_prefix} does not fit openapi.yml version {specs.info.version}" ) + app[HealthCheck.__name__] = HealthCheck(app) + log.debug("Setup %s", f"{app[HealthCheck.__name__]=}") + # basic routes app.add_routes(rest_handlers.routes) - if not is_diagnostics_enabled: - # NOTE: the healthcheck route is normally in diagnostics, but - # if disabled, this plugin adds a simple version of it - app.add_routes( - [ - web.get( - path=f"/{api_version_prefix}/health", - handler=rest_handlers.check_running, - name="check_health", - ) - ] - ) # middlewares # NOTE: using safe get here since some tests use incomplete configs diff --git a/services/web/server/src/simcore_service_webserver/rest_handlers.py b/services/web/server/src/simcore_service_webserver/rest_handlers.py index bcca079e86e..b1ae2cd4c0a 100644 --- a/services/web/server/src/simcore_service_webserver/rest_handlers.py +++ b/services/web/server/src/simcore_service_webserver/rest_handlers.py @@ -1,4 +1,4 @@ -""" Basic diagnostic handles to the rest API for operations +""" Basic healthckeck and configuration handles to the rest API """ @@ -7,29 +7,53 @@ from aiohttp import web from servicelib.aiohttp.rest_utils import extract_and_validate -from ._meta import __version__, api_version_prefix +from ._meta import api_version_prefix from .application_settings import APP_SETTINGS_KEY +from .rest_healthcheck import HealthCheck, HealthCheckFailed log = logging.getLogger(__name__) routes = web.RouteTableDef() -@routes.get(f"/{api_version_prefix}/", name="check_running") -async def check_running(_request: web.Request): - # - # - This entry point is used as a fast way - # to check that the service is still running - # - Do not do add any expensive computatio here - # - Healthcheck has been moved to diagnostics module - # - data = { - "name": __name__.split(".")[0], - "version": f"{__version__}", - "status": "SERVICE_RUNNING", - "api_version": f"{__version__}", - } - return web.json_response(data={"data": data}) +@routes.get(f"/{api_version_prefix}/health", name="healthcheck_liveness_probe") +async def healthcheck_liveness_probe(request: web.Request): + """Liveness probe: "Check if the container is alive" + + This is checked by the containers orchestrator (docker swarm). When the service + is unhealthy, it will restart it so it can recover a working state. + + SEE doc in rest_healthcheck.py + """ + healthcheck: HealthCheck = request.app[HealthCheck.__name__] + + try: + # if slots append get too delayed, just timeout + health_report = await healthcheck.run(request.app) + except HealthCheckFailed as err: + log.warning("%s", err) + raise web.HTTPServiceUnavailable(reason="unhealthy") + + return web.json_response(data={"data": health_report}) + + +@routes.get(f"/{api_version_prefix}/", name="healthcheck_readiness_probe") +async def healthcheck_readiness_probe(request: web.Request): + """Readiness probe: "Check if the container is ready to receive traffic" + + When the target service is unhealthy, no traffic should be sent to it. Service discovery + services and load balancers (e.g. traefik) typically cut traffic from targets + in one way or another. + + SEE doc in rest_healthcheck.py + """ + + healthcheck: HealthCheck = request.app[HealthCheck.__name__] + health_report = healthcheck.get_app_info(request.app) + # NOTE: do NOT run healthcheck here, just return info fast. + health_report["status"] = "SERVICE_RUNNING" + + return web.json_response(data={"data": health_report}) @routes.get(f"/{api_version_prefix}/config", name="get_config") diff --git a/services/web/server/src/simcore_service_webserver/rest_healthcheck.py b/services/web/server/src/simcore_service_webserver/rest_healthcheck.py new file mode 100644 index 00000000000..d6f41621b38 --- /dev/null +++ b/services/web/server/src/simcore_service_webserver/rest_healthcheck.py @@ -0,0 +1,126 @@ +""" Service healthcheck + + +## Types of health checks + +Based on the service types, we can categorize health checks based on the actions they take. + +- Reboot: When the target is unhealthy, the target should be restarted to recover to a working state. +Container and VM orchestration platforms typically perform reboots. +- Cut traffic: When the target is unhealthy, no traffic should be sent to the target. Service discovery +services and load balancers typically cut traffic from targets in one way or another. + +The difference between these is that rebooting attempts to actively repair the target, while cutting +traffic leaves room for the target to repair itself. + +In Kubernetes health-checks are called *probes*: +- The health check for reboots is called a *liveness probe*: "Check if the container is alive". +- The health check for cutting traffic is called a *readiness probe*: "Check if the container is ready to receive traffic". + +Taken from https://medium.com/polarsquad/how-should-i-answer-a-health-check-aa1fcf6e858e + + +## docker healthchecks: + + --interval=DURATION (default: 30s) + --timeout=DURATION (default: 30s) + --start-period=DURATION (default: 0s) + --retries=N (default: 3) + + The health check will first run *interval* seconds after the container is started, and + then again *interval* seconds after each previous check completes. + + If a single run of the check takes longer than *timeout* seconds then the check is considered to have failed (SEE HealthCheckFailed). + + It takes *retries* consecutive failures of the health check for the container to be considered **unhealthy**. + + *start period* provides initialization time for containers that need time to bootstrap. Probe failure during + that period will not be counted towards the maximum number of retries. + + However, if a health check succeeds during the *start period*, the container is considered started and all consecutive + failures will be counted towards the maximum number of retries. + +Taken from https://docs.docker.com/engine/reference/builder/#healthcheck +""" + + +import asyncio +import inspect +from typing import TYPE_CHECKING, Any, Awaitable, Callable, Dict, Optional + +from aiohttp import web +from aiosignal import Signal + +from ._constants import APP_SETTINGS_KEY + +_HealthCheckSlot = Callable[[web.Application], Awaitable[None]] + +if TYPE_CHECKING: + _HealthCheckSignal = Signal[_HealthCheckSlot] + +else: + _HealthCheckSignal = Signal + + +class HealthCheckFailed(RuntimeError): + """Failed a health check + + NOTE: not the same as unhealthy. Check module's doc + """ + + +class HealthCheck: + def __init__(self, app: web.Application): + self._on_healthcheck = Signal(owner=self) # type: _HealthCheckSignal + + # The docker engine healthcheck: If a single run of the check takes longer than *timeout* seconds + # then the check is considered to have failed. Therefore there is no need to continue run + self._timeout: Optional[int] = app[APP_SETTINGS_KEY].SC_HEALTHCHECK_TIMEOUT + + def __repr__(self): + return "".format( + self._timeout, len(self._on_healthcheck) + ) + + @property + def on_healthcheck(self) -> _HealthCheckSignal: + """Signal to define a health check. + WARNING: can append **async** slot function. SEE _HealthCheckSlot + """ + return self._on_healthcheck + + @staticmethod + def get_app_info(app: web.Application): + """Minimal (header) health report is information about the app""" + settings = app[APP_SETTINGS_KEY] + return { + "name": settings.APP_NAME, + "version": settings.API_VERSION, + "api_version": settings.API_VERSION, + } + + async def run(self, app: web.Application) -> Dict[str, Any]: + """Runs all registered checks to determine the service health. + + can raise HealthCheckFailed + """ + # Ensures no more signals append after first run + self._on_healthcheck.freeze() + + assert all( # nosec + inspect.iscoroutinefunction(fun) for fun in self._on_healthcheck + ), "All Slot functions that append to on_healthcheck must be coroutines. SEE _HealthCheckSlot" + + try: + heath_report: Dict[str, Any] = self.get_app_info(app) + + # TODO: every signal could return some info on the health on each part + # that is appended on heath_report + await asyncio.wait_for( + self._on_healthcheck.send(app), timeout=self._timeout + ) + + return heath_report + + except asyncio.TimeoutError as err: + raise HealthCheckFailed("Service is slowing down") from err diff --git a/services/web/server/tests/unit/isolated/test_diagnostics.py b/services/web/server/tests/unit/isolated/test_diagnostics.py index e52c56dd630..9a322832c19 100644 --- a/services/web/server/tests/unit/isolated/test_diagnostics.py +++ b/services/web/server/tests/unit/isolated/test_diagnostics.py @@ -1,17 +1,16 @@ -# pylint:disable=unused-argument -# pylint:disable=redefined-outer-name -# pylint:disable=no-name-in-module +# pylint: disable=redefined-outer-name +# pylint: disable=unused-argument +# pylint: disable=unused-variable from typing import Dict from unittest.mock import Mock import pytest -from servicelib.aiohttp.application_keys import APP_OPENAPI_SPECS_KEY from servicelib.aiohttp.application_setup import APP_SETUP_COMPLETED_KEY from simcore_service_webserver import diagnostics_handlers from simcore_service_webserver.application_settings import setup_settings from simcore_service_webserver.diagnostics import setup_diagnostics -from simcore_service_webserver.rest import api_version_prefix +from simcore_service_webserver.rest import api_version_prefix, setup_rest class MockApp(dict): @@ -32,14 +31,16 @@ def __setitem__(self, key, value): def assert_none_overriden(self): assert not self._overriden + def add_routes(self, *args, **kwargs): + self.router.add_routes(*args, **kwargs) + @pytest.fixture def app_mock(openapi_specs): app = MockApp() - # some inits to emulate simcore_service_webserver.rest setup - app[APP_SETUP_COMPLETED_KEY] = ["simcore_service_webserver.rest"] - app[APP_OPENAPI_SPECS_KEY] = openapi_specs + # emulates security is initialized + app[APP_SETUP_COMPLETED_KEY] = ["simcore_service_webserver.security"] return app @@ -47,8 +48,8 @@ def app_mock(openapi_specs): def test_unique_application_keys( app_mock, openapi_specs, mock_env_devel_environment: Dict[str, str] ): - # this module has A LOT of constants and it is easy to override them setup_settings(app_mock) + setup_rest(app_mock) setup_diagnostics(app_mock) for key, value in app_mock.items(): @@ -56,6 +57,7 @@ def test_unique_application_keys( assert any(key for key in app_mock if "diagnostics" in key) + # this module has A LOT of constants and it is easy to override them app_mock.assert_none_overriden() diff --git a/services/web/server/tests/unit/isolated/test_diagnostics_healthcheck.py b/services/web/server/tests/unit/isolated/test_diagnostics_healthcheck.py index 961d5e562d3..4dfbd9f30ed 100644 --- a/services/web/server/tests/unit/isolated/test_diagnostics_healthcheck.py +++ b/services/web/server/tests/unit/isolated/test_diagnostics_healthcheck.py @@ -1,3 +1,4 @@ +# pylint: disable=protected-access # pylint: disable=redefined-outer-name # pylint: disable=unused-argument # pylint: disable=unused-variable @@ -16,8 +17,8 @@ from simcore_service_webserver._constants import APP_SETTINGS_KEY from simcore_service_webserver.application_settings import setup_settings from simcore_service_webserver.diagnostics import setup_diagnostics -from simcore_service_webserver.diagnostics_core import ( - HealthError, +from simcore_service_webserver.diagnostics_healthcheck import ( + HealthCheckFailed, assert_healthy_app, kLATENCY_PROBE, ) @@ -31,7 +32,7 @@ def health_check_path(api_version_prefix) -> URL: - return f"/{api_version_prefix}/health" + return URL(f"/{api_version_prefix}/health") async def health_check_emulator( @@ -77,6 +78,7 @@ def mock_environment(mock_env_devel_environment: Dict[str, str], monkeypatch): monkeypatch.setenv("DIAGNOSTICS_MAX_TASK_DELAY", f"{SLOW_HANDLER_DELAY_SECS}") monkeypatch.setenv("DIAGNOSTICS_MAX_AVG_LATENCY", f"{2.0}") monkeypatch.setenv("DIAGNOSTICS_START_SENSING_DELAY", f"{0}") # inmidiately + monkeypatch.setenv("SC_HEALTHCHECK_TIMEOUT", "2m") @pytest.fixture @@ -214,7 +216,7 @@ async def test_diagnose_on_response_delays(client): assert latency_observed > tmax # diagnostics - with pytest.raises(HealthError): + with pytest.raises(HealthCheckFailed): assert_healthy_app(client.app) diff --git a/services/web/server/tests/unit/with_dbs/conftest.py b/services/web/server/tests/unit/with_dbs/conftest.py index 8071262e5ae..b08cb1511a9 100644 --- a/services/web/server/tests/unit/with_dbs/conftest.py +++ b/services/web/server/tests/unit/with_dbs/conftest.py @@ -25,7 +25,7 @@ import simcore_service_webserver.db_models as orm import simcore_service_webserver.utils import sqlalchemy as sa -from _helpers import MockedStorageSubsystem # type: ignore +from _helpers import MockedStorageSubsystem from _pytest.monkeypatch import MonkeyPatch from aiohttp import web from aiohttp.test_utils import TestClient, TestServer @@ -56,8 +56,6 @@ def disable_swagger_doc_generation( by not enabling the swagger documentation, 1.8s per test is gained """ monkeypatch.setenv("REST_SWAGGER_API_DOC_ENABLED", "0") - # TODO: after removing config, this might be used. - # monkeypatch.setenv("OSPARC_SIMCORE_REPO_ROOTDIR", f"{osparc_simcore_root_dir}") @pytest.fixture(scope="session")