From 5b662b5bc01baed2266bde2d73d58d10b18a0442 Mon Sep 17 00:00:00 2001 From: Sergey Vasilyev Date: Sun, 8 Mar 2020 19:21:55 +0100 Subject: [PATCH 01/20] Isolate sleep-or-wait to one module, relax the required arguments --- kopf/engines/sleeping.py | 17 +++++--- kopf/reactor/activities.py | 6 +-- kopf/reactor/handling.py | 2 - kopf/reactor/processing.py | 12 ++++-- tests/handling/test_activity_triggering.py | 5 ++- tests/handling/test_delays.py | 5 +-- tests/test_sleeping.py | 47 ++++++++++++++++++---- 7 files changed, 68 insertions(+), 26 deletions(-) diff --git a/kopf/engines/sleeping.py b/kopf/engines/sleeping.py index dbca1f1c..5996c4e5 100644 --- a/kopf/engines/sleeping.py +++ b/kopf/engines/sleeping.py @@ -2,12 +2,13 @@ Advanced modes of sleeping. """ import asyncio -from typing import Optional +import collections +from typing import Optional, Collection, Union async def sleep_or_wait( - delay: float, - event: asyncio.Event, + delays: Union[None, float, Collection[Union[None, float]]], + event: Optional[asyncio.Event] = None, ) -> Optional[float]: """ Measure the sleep time: either until the timeout, or until the event is set. @@ -17,13 +18,19 @@ async def sleep_or_wait( In theory, the result can be ``0`` if the sleep was interrupted precisely the last moment before timing out; this is unlikely to happen though. """ + passed_delays = delays if isinstance(delays, collections.abc.Collection) else [delays] + actual_delays = [delay for delay in passed_delays if delay is not None] + minimal_delay = min(actual_delays) if actual_delays else 0 + + awakening_event = event if event is not None else asyncio.Event() + loop = asyncio.get_running_loop() try: start_time = loop.time() - await asyncio.wait_for(event.wait(), timeout=delay) + await asyncio.wait_for(awakening_event.wait(), timeout=minimal_delay) except asyncio.TimeoutError: return None # interruptable sleep is over: uninterrupted. else: end_time = loop.time() duration = end_time - start_time - return max(0, delay - duration) + return max(0, minimal_delay - duration) diff --git a/kopf/reactor/activities.py b/kopf/reactor/activities.py index d7a1ee80..ee77a049 100644 --- a/kopf/reactor/activities.py +++ b/kopf/reactor/activities.py @@ -16,7 +16,6 @@ * Specific authentication methods, such as the authentication piggybacking, belong to neither the reactor, nor the engines, nor the client wrappers. """ -import asyncio import logging from typing import NoReturn, Mapping, MutableMapping @@ -121,10 +120,7 @@ async def run_activity( ) outcomes.update(current_outcomes) state = state.with_outcomes(current_outcomes) - delay = state.delay - if delay: - limited_delay = min(delay, handling.WAITING_KEEPALIVE_INTERVAL) - await sleeping.sleep_or_wait(limited_delay, asyncio.Event()) + await sleeping.sleep_or_wait(state.delay) # Activities assume that all handlers must eventually succeed. # We raise from the 1st exception only: just to have something real in the tracebacks. diff --git a/kopf/reactor/handling.py b/kopf/reactor/handling.py index 1ed1438e..509ff30e 100644 --- a/kopf/reactor/handling.py +++ b/kopf/reactor/handling.py @@ -24,8 +24,6 @@ from kopf.structs import diffs from kopf.structs import handlers as handlers_ -WAITING_KEEPALIVE_INTERVAL = 10 * 60 -""" How often to wake up from the long sleep, to show liveness in the logs. """ DEFAULT_RETRY_DELAY = 1 * 60 """ The default delay duration for the regular exception in retry-mode. 
""" diff --git a/kopf/reactor/processing.py b/kopf/reactor/processing.py index 4c18111c..52ea7e0f 100644 --- a/kopf/reactor/processing.py +++ b/kopf/reactor/processing.py @@ -35,6 +35,9 @@ from kopf.structs import patches from kopf.structs import resources +# How often to wake up from the long sleep, to show liveness in the logs. +WAITING_KEEPALIVE_INTERVAL = 10 * 60 + async def process_resource_event( lifecycle: lifecycles.LifeCycleFn, @@ -137,10 +140,13 @@ async def process_resource_event( elif delay is None and not patch: logger.debug(f"Handling cycle is finished, waiting for new changes since now.") elif delay is not None: - if delay > 0: + if delay > WAITING_KEEPALIVE_INTERVAL: + limit = WAITING_KEEPALIVE_INTERVAL + logger.debug(f"Sleeping for {delay} (capped {limit}) seconds for the delayed handlers.") + unslept_delay = await sleeping.sleep_or_wait(limit, replenished) + elif delay > 0: logger.debug(f"Sleeping for {delay} seconds for the delayed handlers.") - limited_delay = min(delay, handling.WAITING_KEEPALIVE_INTERVAL) - unslept_delay = await sleeping.sleep_or_wait(limited_delay, replenished) + unslept_delay = await sleeping.sleep_or_wait(delay, replenished) else: unslept_delay = None # no need to sleep? means: slept in full. diff --git a/tests/handling/test_activity_triggering.py b/tests/handling/test_activity_triggering.py index 8e596a17..376e870a 100644 --- a/tests/handling/test_activity_triggering.py +++ b/tests/handling/test_activity_triggering.py @@ -170,4 +170,7 @@ async def sleep_or_wait_substitute(*_, **__): lifecycle=all_at_once, ) - assert sleep_or_wait.call_count == 3 # 3 retries, 1 sleep each + assert sleep_or_wait.call_count >= 3 # 3 retries, 1 sleep each + assert sleep_or_wait.call_count <= 4 # 3 retries, 1 final success (delay=None), not more + if sleep_or_wait.call_count > 3: + sleep_or_wait.call_args_list[-1].args[0] is None diff --git a/tests/handling/test_delays.py b/tests/handling/test_delays.py index b1c1c246..dc9fda22 100644 --- a/tests/handling/test_delays.py +++ b/tests/handling/test_delays.py @@ -7,8 +7,7 @@ import kopf from kopf.reactor.handling import TemporaryError -from kopf.reactor.handling import WAITING_KEEPALIVE_INTERVAL -from kopf.reactor.processing import process_resource_event +from kopf.reactor.processing import process_resource_event, WAITING_KEEPALIVE_INTERVAL from kopf.storage.finalizers import FINALIZER from kopf.storage.states import HandlerState from kopf.structs.containers import ResourceMemories @@ -114,5 +113,5 @@ async def test_delayed_handlers_sleep( assert k8s_mocked.sleep_or_wait.call_args_list[0][0][0] == delay assert_logs([ - r"Sleeping for [\d\.]+ seconds", + r"Sleeping for ([\d\.]+|[\d\.]+ \(capped [\d\.]+\)) seconds", ]) diff --git a/tests/test_sleeping.py b/tests/test_sleeping.py index 0014f207..b9628dcf 100644 --- a/tests/test_sleeping.py +++ b/tests/test_sleeping.py @@ -1,26 +1,59 @@ import asyncio +import pytest from kopf.engines.sleeping import sleep_or_wait -async def test_sleep_or_wait_by_delay_reached(timer): - event = asyncio.Event() +async def test_the_only_delay_is_awaited(timer): with timer: - unslept = await asyncio.wait_for(sleep_or_wait(0.10, event), timeout=1.0) + unslept = await asyncio.wait_for(sleep_or_wait(0.10), timeout=1.0) + assert 0.10 <= timer.seconds < 0.11 + assert unslept is None + + +async def test_the_shortest_delay_is_awaited(timer): + with timer: + unslept = await asyncio.wait_for(sleep_or_wait([0.10, 0.20]), timeout=1.0) + assert 0.10 <= timer.seconds < 0.11 + assert unslept is None + + +async 
def test_specific_delays_only_are_awaited(timer): + with timer: + unslept = await asyncio.wait_for(sleep_or_wait([0.10, None]), timeout=1.0) assert 0.10 <= timer.seconds < 0.11 assert unslept is None -async def test_sleep_or_wait_by_event_set(timer): +async def test_passed_delays_skip_sleeping(timer): + with timer: + unslept = await asyncio.wait_for(sleep_or_wait([0.10, -10]), timeout=1.0) + assert timer.seconds < 0.01 + assert unslept is None + + +@pytest.mark.parametrize('delays', [ + pytest.param([], id='empty-list'), + pytest.param([None], id='list-of-none'), +]) +async def test_no_delays_skip_sleeping(timer, delays): + with timer: + unslept = await asyncio.wait_for(sleep_or_wait(delays), timeout=1.0) + assert timer.seconds < 0.01 + assert unslept is None + + +async def test_by_event_set_before_time_comes(timer): event = asyncio.Event() asyncio.get_running_loop().call_later(0.07, event.set) with timer: unslept = await asyncio.wait_for(sleep_or_wait(0.10, event), timeout=1.0) + assert unslept is not None + assert 0.02 <= unslept <= 0.04 assert 0.06 <= timer.seconds <= 0.08 - assert 0.02 <= unslept <= 0.04 -async def test_sleep_or_wait_with_zero_time_and_event_cleared(timer): +async def test_with_zero_time_and_event_initially_cleared(timer): event = asyncio.Event() event.clear() with timer: @@ -29,7 +62,7 @@ async def test_sleep_or_wait_with_zero_time_and_event_cleared(timer): assert unslept is None -async def test_sleep_or_wait_with_zero_time_and_event_preset(timer): +async def test_with_zero_time_and_event_initially_set(timer): event = asyncio.Event() event.set() with timer: From f7cdc6890c0767b43fe6bb1b381c93c4eaecedec Mon Sep 17 00:00:00 2001 From: Sergey Vasilyev Date: Sun, 8 Mar 2020 19:50:39 +0100 Subject: [PATCH 02/20] Deprecate `cause=` kwarg in the handler functions --- examples/05-handlers/example.py | 4 ++-- kopf/on.py | 22 +++++++++++++++++++++- kopf/reactor/invocation.py | 2 +- 3 files changed, 24 insertions(+), 4 deletions(-) diff --git a/examples/05-handlers/example.py b/examples/05-handlers/example.py index e33a3d93..e4c1dff4 100644 --- a/examples/05-handlers/example.py +++ b/examples/05-handlers/example.py @@ -2,7 +2,7 @@ @kopf.on.resume('zalando.org', 'v1', 'kopfexamples') -def resume_fn_1(cause, **kwargs): +def resume_fn_1(**kwargs): print(f'RESUMED 1st') @@ -12,7 +12,7 @@ def create_fn_1(**kwargs): @kopf.on.resume('zalando.org', 'v1', 'kopfexamples') -def resume_fn_2(cause, **kwargs): +def resume_fn_2(**kwargs): print(f'RESUMED 2nd') diff --git a/kopf/on.py b/kopf/on.py index 0f1da66b..41176dee 100644 --- a/kopf/on.py +++ b/kopf/on.py @@ -11,9 +11,10 @@ def creation_handler(**kwargs): """ # TODO: add cluster=True support (different API methods) +import inspect import warnings -from typing import Optional, Callable +from typing import Optional, Callable, Any from kopf.reactor import handling from kopf.reactor import registries @@ -39,6 +40,7 @@ def startup( # lgtm[py/similar-function] registry: Optional[registries.OperatorRegistry] = None, ) -> ActivityDecorator: def decorator(fn: callbacks.ActivityFn) -> callbacks.ActivityFn: + _warn_deprecated_signatures(fn) real_registry = registry if registry is not None else registries.get_default_registry() real_id = registries.generate_id(fn=fn, id=id) handler = handlers.ActivityHandler( @@ -62,6 +64,7 @@ def cleanup( # lgtm[py/similar-function] registry: Optional[registries.OperatorRegistry] = None, ) -> ActivityDecorator: def decorator(fn: callbacks.ActivityFn) -> callbacks.ActivityFn: + _warn_deprecated_signatures(fn) 
real_registry = registry if registry is not None else registries.get_default_registry() real_id = registries.generate_id(fn=fn, id=id) handler = handlers.ActivityHandler( @@ -86,6 +89,7 @@ def login( # lgtm[py/similar-function] ) -> ActivityDecorator: """ ``@kopf.on.login()`` handler for custom (re-)authentication. """ def decorator(fn: callbacks.ActivityFn) -> callbacks.ActivityFn: + _warn_deprecated_signatures(fn) real_registry = registry if registry is not None else registries.get_default_registry() real_id = registries.generate_id(fn=fn, id=id) handler = handlers.ActivityHandler( @@ -110,6 +114,7 @@ def probe( # lgtm[py/similar-function] ) -> ActivityDecorator: """ ``@kopf.on.probe()`` handler for arbitrary liveness metrics. """ def decorator(fn: callbacks.ActivityFn) -> callbacks.ActivityFn: + _warn_deprecated_signatures(fn) real_registry = registry if registry is not None else registries.get_default_registry() real_id = registries.generate_id(fn=fn, id=id) handler = handlers.ActivityHandler( @@ -139,6 +144,7 @@ def resume( # lgtm[py/similar-function] ) -> ResourceChangingDecorator: """ ``@kopf.on.resume()`` handler for the object resuming on operator (re)start. """ def decorator(fn: callbacks.ResourceChangingFn) -> callbacks.ResourceChangingFn: + _warn_deprecated_signatures(fn) _warn_deprecated_filters(labels, annotations) real_registry = registry if registry is not None else registries.get_default_registry() real_resource = resources.Resource(group, version, plural) @@ -171,6 +177,7 @@ def create( # lgtm[py/similar-function] ) -> ResourceChangingDecorator: """ ``@kopf.on.create()`` handler for the object creation. """ def decorator(fn: callbacks.ResourceChangingFn) -> callbacks.ResourceChangingFn: + _warn_deprecated_signatures(fn) _warn_deprecated_filters(labels, annotations) real_registry = registry if registry is not None else registries.get_default_registry() real_resource = resources.Resource(group, version, plural) @@ -203,6 +210,7 @@ def update( # lgtm[py/similar-function] ) -> ResourceChangingDecorator: """ ``@kopf.on.update()`` handler for the object update or change. """ def decorator(fn: callbacks.ResourceChangingFn) -> callbacks.ResourceChangingFn: + _warn_deprecated_signatures(fn) _warn_deprecated_filters(labels, annotations) real_registry = registry if registry is not None else registries.get_default_registry() real_resource = resources.Resource(group, version, plural) @@ -236,6 +244,7 @@ def delete( # lgtm[py/similar-function] ) -> ResourceChangingDecorator: """ ``@kopf.on.delete()`` handler for the object deletion. """ def decorator(fn: callbacks.ResourceChangingFn) -> callbacks.ResourceChangingFn: + _warn_deprecated_signatures(fn) _warn_deprecated_filters(labels, annotations) real_registry = registry if registry is not None else registries.get_default_registry() real_resource = resources.Resource(group, version, plural) @@ -269,6 +278,7 @@ def field( # lgtm[py/similar-function] ) -> ResourceChangingDecorator: """ ``@kopf.on.field()`` handler for the individual field changes. """ def decorator(fn: callbacks.ResourceChangingFn) -> callbacks.ResourceChangingFn: + _warn_deprecated_signatures(fn) _warn_deprecated_filters(labels, annotations) real_registry = registry if registry is not None else registries.get_default_registry() real_resource = resources.Resource(group, version, plural) @@ -297,6 +307,7 @@ def event( # lgtm[py/similar-function] ) -> ResourceWatchingDecorator: """ ``@kopf.on.event()`` handler for the silent spies on the events. 
""" def decorator(fn: callbacks.ResourceWatchingFn) -> callbacks.ResourceWatchingFn: + _warn_deprecated_signatures(fn) _warn_deprecated_filters(labels, annotations) real_registry = registry if registry is not None else registries.get_default_registry() real_resource = resources.Resource(group, version, plural) @@ -356,6 +367,7 @@ def create_task(*, spec, task=task, **kwargs): create function will have its own value, not the latest in the for-cycle. """ def decorator(fn: callbacks.ResourceChangingFn) -> callbacks.ResourceChangingFn: + _warn_deprecated_signatures(fn) _warn_deprecated_filters(labels, annotations) parent_handler = handling.handler_var.get() real_registry = registry if registry is not None else handling.subregistry_var.get() @@ -419,6 +431,14 @@ def create_single_task(task=task, **_): return decorator(fn) +def _warn_deprecated_signatures( + fn: Callable[..., Any], +) -> None: + argspec = inspect.getfullargspec(fn) + if 'cause' in argspec.args or 'cause' in argspec.kwonlyargs: + warnings.warn("`cause` kwarg is deprecated; use kwargs directly.", DeprecationWarning) + + def _warn_deprecated_filters( labels: Optional[filters.MetaFilter], annotations: Optional[filters.MetaFilter], diff --git a/kopf/reactor/invocation.py b/kopf/reactor/invocation.py index c1073e3b..49955600 100644 --- a/kopf/reactor/invocation.py +++ b/kopf/reactor/invocation.py @@ -52,7 +52,7 @@ def build_kwargs( # Add aliases for the kwargs, directly linked to the body, or to the assumed defaults. if isinstance(cause, causation.BaseCause): new_kwargs.update( - cause=cause, + cause=cause, # DEPRECATED: An internal container for kwargs - do not expose directly. logger=cause.logger, ) if isinstance(cause, causation.ActivityCause): From b1470dfaa254ab7f0120ff088aca88c6ff2a2167 Mon Sep 17 00:00:00 2001 From: Sergey Vasilyev Date: Sun, 8 Mar 2020 20:36:41 +0100 Subject: [PATCH 03/20] Return multiple delays from cause-processing routines --- kopf/reactor/processing.py | 22 ++++++++++++---------- kopf/storage/states.py | 25 +++++++++++++++++-------- 2 files changed, 29 insertions(+), 18 deletions(-) diff --git a/kopf/reactor/processing.py b/kopf/reactor/processing.py index 52ea7e0f..1d2171a3 100644 --- a/kopf/reactor/processing.py +++ b/kopf/reactor/processing.py @@ -15,7 +15,7 @@ """ import asyncio import datetime -from typing import Optional +from typing import Collection, Optional from kopf.clients import patching from kopf.engines import logging as logging_engine @@ -99,6 +99,7 @@ async def process_resource_event( # Object patch accumulator. Populated by the methods. Applied in the end of the handler. # Detect the cause and handle it (or at least log this happened). + delays: Collection[float] = [] if registry.resource_changing_handlers[resource]: extra_fields = registry.resource_changing_handlers[resource].get_extra_fields() old = settings.persistence.diffbase_storage.fetch(body=body) @@ -118,7 +119,7 @@ async def process_resource_event( memo=memory.user_data, initial=memory.noticed_by_listing and not memory.fully_handled_once, ) - delay = await process_resource_changing_cause( + delays = await process_resource_changing_cause( lifecycle=lifecycle, registry=registry, settings=settings, @@ -128,6 +129,7 @@ async def process_resource_event( # Whatever was done, apply the accumulated changes to the object. # But only once, to reduce the number of API calls and the generated irrelevant events. 
+ delay = min(delays) if delays else None if patch: logger.debug("Patching with: %r", patch) await patching.patch_obj(resource=resource, patch=patch, body=body) @@ -197,16 +199,16 @@ async def process_resource_changing_cause( settings: configuration.OperatorSettings, memory: containers.ResourceMemory, cause: causation.ResourceChangingCause, -) -> Optional[float]: +) -> Collection[float]: """ Handle a detected cause, as part of the bigger handler routine. """ logger = cause.logger patch = cause.patch # TODO get rid of this alias body = cause.body # TODO get rid of this alias - delay = None - done = None - skip = None + delays: Collection[float] = [] + done: Optional[bool] = None + skip: Optional[bool] = None resource_changing_handlers = registry.resource_changing_handlers[cause.resource] deletion_must_be_blocked = resource_changing_handlers.requires_finalizer(cause=cause) @@ -215,12 +217,12 @@ async def process_resource_changing_cause( if deletion_must_be_blocked and not deletion_is_blocked: logger.debug("Adding the finalizer, thus preventing the actual deletion.") finalizers.block_deletion(body=body, patch=patch) - return None + return () if not deletion_must_be_blocked and deletion_is_blocked: logger.debug("Removing the finalizer, as there are no handlers requiring it.") finalizers.allow_deletion(body=body, patch=patch) - return None + return () # Regular causes invoke the handlers. if cause.reason in handlers_.HANDLER_REASONS: @@ -249,7 +251,7 @@ async def process_resource_changing_cause( state.purge(body=cause.body, patch=cause.patch, storage=storage) done = state.done - delay = state.delay + delays = state.delays else: skip = True @@ -276,4 +278,4 @@ async def process_resource_changing_cause( logger.debug("Something has changed, but we are not interested (the essence is the same).") # The delay is then consumed by the main handling routine (in different ways). - return delay + return delays diff --git a/kopf/storage/states.py b/kopf/storage/states.py index 833903fd..47f3ec38 100644 --- a/kopf/storage/states.py +++ b/kopf/storage/states.py @@ -230,15 +230,24 @@ def done(self) -> bool: @property def delay(self) -> Optional[float]: + delays = self.delays # calculate only once, to save bit of CPU + return min(delays) if delays else None + + @property + def delays(self) -> Collection[float]: + """ + Resulting delays for the handlers (only the postponed ones). + + The delays are then reduced to one single sleep in the top-level + processing routine, based on all delays of different origin: + e.g. postponed daemons, stopping daemons, temporarily failed handlers. + """ now = datetime.datetime.utcnow() - state_times = [handler_state.delayed for handler_state in self._states.values()] - clean_times = [t for t in state_times if t is not None] - if clean_times: - until = min(clean_times) # the soonest awake datetime. 
- delay = (until - now).total_seconds() - return max(0, delay) - else: - return None + return [ + max(0, (handler_state.delayed - now).total_seconds()) + for handler_state in self._states.values() + if handler_state.delayed + ] def deliver_results( From 1ca679229358d98289f96ada071f7b8636c4da45 Mon Sep 17 00:00:00 2001 From: Sergey Vasilyev Date: Sun, 8 Mar 2020 20:40:29 +0100 Subject: [PATCH 04/20] Extract patch-and-sleep step to a separate reusable coroutine --- kopf/reactor/processing.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/kopf/reactor/processing.py b/kopf/reactor/processing.py index 1d2171a3..56a66f82 100644 --- a/kopf/reactor/processing.py +++ b/kopf/reactor/processing.py @@ -127,9 +127,25 @@ async def process_resource_event( cause=resource_changing_cause, ) - # Whatever was done, apply the accumulated changes to the object. + # Whatever was done, apply the accumulated changes to the object, or sleep-n-touch for delays. # But only once, to reduce the number of API calls and the generated irrelevant events. + await apply_reaction_outcomes( + resource=resource, body=body, + patch=patch, delays=delays, + logger=logger, replenished=replenished) + + +async def apply_reaction_outcomes( + *, + resource: resources.Resource, + body: bodies.Body, + patch: patches.Patch, + delays: Collection[float], + logger: logging_engine.ObjectLogger, + replenished: asyncio.Event, +) -> None: delay = min(delays) if delays else None + if patch: logger.debug("Patching with: %r", patch) await patching.patch_obj(resource=resource, patch=patch, body=body) From 1671af24c40ad070ad0a9f32c3528cdb93070e99 Mon Sep 17 00:00:00 2001 From: Sergey Vasilyev Date: Thu, 12 Mar 2020 20:53:15 +0100 Subject: [PATCH 05/20] Omit the patch-and-sleep step for the objects known to be gone --- kopf/reactor/processing.py | 10 ++++++---- tests/handling/test_cause_handling.py | 11 +++++++---- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/kopf/reactor/processing.py b/kopf/reactor/processing.py index 56a66f82..a2dec25b 100644 --- a/kopf/reactor/processing.py +++ b/kopf/reactor/processing.py @@ -129,10 +129,12 @@ async def process_resource_event( # Whatever was done, apply the accumulated changes to the object, or sleep-n-touch for delays. # But only once, to reduce the number of API calls and the generated irrelevant events. - await apply_reaction_outcomes( - resource=resource, body=body, - patch=patch, delays=delays, - logger=logger, replenished=replenished) + # And only of the object is at least supposed to exist (not "GONE"), even if actually does not. 
+ if raw_event['type'] != 'DELETED': + await apply_reaction_outcomes( + resource=resource, body=body, + patch=patch, delays=delays, + logger=logger, replenished=replenished) async def apply_reaction_outcomes( diff --git a/tests/handling/test_cause_handling.py b/tests/handling/test_cause_handling.py index 0d8a5635..15444e47 100644 --- a/tests/handling/test_cause_handling.py +++ b/tests/handling/test_cause_handling.py @@ -6,13 +6,15 @@ import kopf from kopf.reactor.processing import process_resource_event from kopf.storage.diffbase import LAST_SEEN_ANNOTATION +from kopf.storage.finalizers import FINALIZER from kopf.structs.containers import ResourceMemories from kopf.structs.handlers import Reason EVENT_TYPES = [None, 'ADDED', 'MODIFIED', 'DELETED'] +EVENT_TYPES_WHEN_EXISTS = [None, 'ADDED', 'MODIFIED'] -@pytest.mark.parametrize('event_type', EVENT_TYPES) +@pytest.mark.parametrize('event_type', EVENT_TYPES_WHEN_EXISTS) async def test_create(registry, settings, handlers, resource, cause_mock, event_type, caplog, assert_logs, k8s_mocked): caplog.set_level(logging.DEBUG) @@ -54,7 +56,7 @@ async def test_create(registry, settings, handlers, resource, cause_mock, event_ ]) -@pytest.mark.parametrize('event_type', EVENT_TYPES) +@pytest.mark.parametrize('event_type', EVENT_TYPES_WHEN_EXISTS) async def test_update(registry, settings, handlers, resource, cause_mock, event_type, caplog, assert_logs, k8s_mocked): caplog.set_level(logging.DEBUG) @@ -96,11 +98,12 @@ async def test_update(registry, settings, handlers, resource, cause_mock, event_ ]) -@pytest.mark.parametrize('event_type', EVENT_TYPES) +@pytest.mark.parametrize('event_type', EVENT_TYPES_WHEN_EXISTS) async def test_delete(registry, settings, handlers, resource, cause_mock, event_type, caplog, assert_logs, k8s_mocked): caplog.set_level(logging.DEBUG) cause_mock.reason = Reason.DELETE + event_body = {'metadata': {'deletionTimestamp': '...', 'finalizers': [FINALIZER]}} event_queue = asyncio.Queue() await process_resource_event( @@ -109,7 +112,7 @@ async def test_delete(registry, settings, handlers, resource, cause_mock, event_ settings=settings, resource=resource, memories=ResourceMemories(), - raw_event={'type': event_type, 'object': {}}, + raw_event={'type': event_type, 'object': event_body}, replenished=asyncio.Event(), event_queue=event_queue, ) From 8514b2d005bc2b23d8528aa86e20aba445ce74ef Mon Sep 17 00:00:00 2001 From: Sergey Vasilyev Date: Sun, 8 Mar 2020 20:44:28 +0100 Subject: [PATCH 06/20] Refactor cause detection and processing (keep the logic) --- kopf/reactor/processing.py | 70 ++++++++++++++++++++------------------ 1 file changed, 36 insertions(+), 34 deletions(-) diff --git a/kopf/reactor/processing.py b/kopf/reactor/processing.py index a2dec25b..4d2fd190 100644 --- a/kopf/reactor/processing.py +++ b/kopf/reactor/processing.py @@ -79,47 +79,50 @@ async def process_resource_event( if raw_event['type'] == 'DELETED': await memories.forget(body) - # Invoke all silent spies. No causation, no progress storage is performed. 
- if registry.resource_watching_handlers[resource]: - resource_watching_cause = causation.detect_resource_watching_cause( - raw_event=raw_event, - resource=resource, - logger=logger, - patch=patch, - body=body, - memo=memory.user_data, - ) + extra_fields = registry.resource_changing_handlers[resource].get_extra_fields() + old = settings.persistence.diffbase_storage.fetch(body=body) + new = settings.persistence.diffbase_storage.build(body=body, extra_fields=extra_fields) + old = settings.persistence.progress_storage.clear(essence=old) if old is not None else None + new = settings.persistence.progress_storage.clear(essence=new) if new is not None else None + diff = diffs.diff(old, new) + + # Detect what are we going to do on this processing cycle. + resource_watching_cause = causation.detect_resource_watching_cause( + raw_event=raw_event, + resource=resource, + logger=logger, + patch=patch, + body=body, + memo=memory.user_data, + ) if registry.resource_watching_handlers[resource] else None + + resource_changing_cause = causation.detect_resource_changing_cause( + raw_event=raw_event, + resource=resource, + logger=logger, + patch=patch, + body=body, + old=old, + new=new, + diff=diff, + memo=memory.user_data, + initial=memory.noticed_by_listing and not memory.fully_handled_once, + ) if registry.resource_changing_handlers[resource] else None + + # Invoke all the handlers that should or could be invoked at this processing cycle. + if resource_watching_cause is not None: await process_resource_watching_cause( lifecycle=lifecycles.all_at_once, registry=registry, settings=settings, - memory=memory, cause=resource_watching_cause, ) # Object patch accumulator. Populated by the methods. Applied in the end of the handler. # Detect the cause and handle it (or at least log this happened). 
- delays: Collection[float] = [] - if registry.resource_changing_handlers[resource]: - extra_fields = registry.resource_changing_handlers[resource].get_extra_fields() - old = settings.persistence.diffbase_storage.fetch(body=body) - new = settings.persistence.diffbase_storage.build(body=body, extra_fields=extra_fields) - old = settings.persistence.progress_storage.clear(essence=old) if old is not None else None - new = settings.persistence.progress_storage.clear(essence=new) if new is not None else None - diff = diffs.diff(old, new) - resource_changing_cause = causation.detect_resource_changing_cause( - raw_event=raw_event, - resource=resource, - logger=logger, - patch=patch, - body=body, - old=old, - new=new, - diff=diff, - memo=memory.user_data, - initial=memory.noticed_by_listing and not memory.fully_handled_once, - ) - delays = await process_resource_changing_cause( + resource_changing_delays: Collection[float] = [] + if resource_changing_cause is not None: + resource_changing_delays = await process_resource_changing_cause( lifecycle=lifecycle, registry=registry, settings=settings, @@ -133,7 +136,7 @@ async def process_resource_event( if raw_event['type'] != 'DELETED': await apply_reaction_outcomes( resource=resource, body=body, - patch=patch, delays=delays, + patch=patch, delays=resource_changing_delays, logger=logger, replenished=replenished) @@ -184,7 +187,6 @@ async def process_resource_watching_cause( lifecycle: lifecycles.LifeCycleFn, registry: registries.OperatorRegistry, settings: configuration.OperatorSettings, - memory: containers.ResourceMemory, cause: causation.ResourceWatchingCause, ) -> None: """ From 7adba026d949f9fe8d3b1420e296161b5eb207c3 Mon Sep 17 00:00:00 2001 From: Sergey Vasilyev Date: Sun, 8 Mar 2020 20:46:46 +0100 Subject: [PATCH 07/20] Move finalizer addition/removal to the top-level processing --- kopf/reactor/processing.py | 43 +++++++++++++++++++++++--------------- 1 file changed, 26 insertions(+), 17 deletions(-) diff --git a/kopf/reactor/processing.py b/kopf/reactor/processing.py index 4d2fd190..cbbcd38d 100644 --- a/kopf/reactor/processing.py +++ b/kopf/reactor/processing.py @@ -109,6 +109,26 @@ async def process_resource_event( initial=memory.noticed_by_listing and not memory.fully_handled_once, ) if registry.resource_changing_handlers[resource] else None + # Block the object from deletion if we have anything to do in its end of life: + # specifically, if there are daemons to kill or mandatory on-deletion handlers to call. + # The high-level handlers are prevented if this event cycle is dedicated to the finalizer. + # The low-level handlers (on-event spying & daemon spawning) are still executed asap. 
+ deletion_is_ongoing = finalizers.is_deletion_ongoing(body=body) + deletion_is_blocked = finalizers.is_deletion_blocked(body=body) + deletion_must_be_blocked = ( + (resource_changing_cause is not None and + registry.resource_changing_handlers[resource].requires_finalizer(resource_changing_cause))) + + if deletion_must_be_blocked and not deletion_is_blocked and not deletion_is_ongoing: + logger.debug("Adding the finalizer, thus preventing the actual deletion.") + finalizers.block_deletion(body=body, patch=patch) + resource_changing_cause = None # prevent further high-level processing this time + + if not deletion_must_be_blocked and deletion_is_blocked: + logger.debug("Removing the finalizer, as there are no handlers requiring it.") + finalizers.allow_deletion(body=body, patch=patch) + resource_changing_cause = None # prevent further high-level processing this time + # Invoke all the handlers that should or could be invoked at this processing cycle. if resource_watching_cause is not None: await process_resource_watching_cause( @@ -130,6 +150,12 @@ async def process_resource_event( cause=resource_changing_cause, ) + # Release the object if everything is done, and it is marked for deletion. + # But not when it has already gone. + if deletion_is_ongoing and deletion_is_blocked and not resource_changing_delays: + logger.debug("Removing the finalizer, thus allowing the actual deletion.") + finalizers.allow_deletion(body=body, patch=patch) + # Whatever was done, apply the accumulated changes to the object, or sleep-n-touch for delays. # But only once, to reduce the number of API calls and the generated irrelevant events. # And only of the object is at least supposed to exist (not "GONE"), even if actually does not. @@ -230,20 +256,6 @@ async def process_resource_changing_cause( done: Optional[bool] = None skip: Optional[bool] = None - resource_changing_handlers = registry.resource_changing_handlers[cause.resource] - deletion_must_be_blocked = resource_changing_handlers.requires_finalizer(cause=cause) - deletion_is_blocked = finalizers.is_deletion_blocked(body=cause.body) - - if deletion_must_be_blocked and not deletion_is_blocked: - logger.debug("Adding the finalizer, thus preventing the actual deletion.") - finalizers.block_deletion(body=body, patch=patch) - return () - - if not deletion_must_be_blocked and deletion_is_blocked: - logger.debug("Removing the finalizer, as there are no handlers requiring it.") - finalizers.allow_deletion(body=body, patch=patch) - return () - # Regular causes invoke the handlers. if cause.reason in handlers_.HANDLER_REASONS: title = handlers_.TITLES.get(cause.reason, repr(cause.reason)) @@ -279,9 +291,6 @@ async def process_resource_changing_cause( if done or skip: if cause.new is not None and cause.old != cause.new: settings.persistence.diffbase_storage.store(body=body, patch=patch, essence=cause.new) - if cause.reason == handlers_.Reason.DELETE: - logger.debug("Removing the finalizer, thus allowing the actual deletion.") - finalizers.allow_deletion(body=body, patch=patch) # Once all handlers have succeeded at least once for any reason, or if there were none, # prevent further resume-handlers (which otherwise happens on each watch-stream re-listing). 
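With this change, the finalizer is added or removed in a single place, before the high-level
handlers run, instead of inside the cause-processing routine. A finalizer becomes required
when there are mandatory deletion handlers (and, later in this series, daemons). A minimal
illustration with the resource used in this repo's examples (the handler name is arbitrary)::

    import kopf

    @kopf.on.delete('zalando.org', 'v1', 'kopfexamples')
    def delete_fn(logger, **kwargs):
        # While such a handler is registered, the operator keeps a finalizer on the object,
        # so the object is not physically deleted until the handler finishes.
        logger.info("Cleaned up.")
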
From 524b6ef7dcf0620f8452c3e449bac9a942b4426e Mon Sep 17 00:00:00 2001 From: Sergey Vasilyev Date: Mon, 16 Mar 2020 01:40:10 +0100 Subject: [PATCH 08/20] Customise how the handlers are self-represented in logs In the assumption that new handler types are coming (daemons, timers), and they must be rendered differently, not as just "handlers", while using the same invocation routines. --- kopf/reactor/handling.py | 20 ++++++++++---------- kopf/structs/handlers.py | 7 +++++++ tests/e2e/test_examples.py | 4 ++-- tests/handling/test_cause_handling.py | 6 +++--- tests/handling/test_delays.py | 2 +- tests/handling/test_event_handling.py | 8 ++++---- 6 files changed, 27 insertions(+), 20 deletions(-) diff --git a/kopf/reactor/handling.py b/kopf/reactor/handling.py index 509ff30e..eb6588e2 100644 --- a/kopf/reactor/handling.py +++ b/kopf/reactor/handling.py @@ -241,13 +241,13 @@ async def execute_handler_once( # The exceptions are handled locally and are not re-raised, to keep the operator running. try: - logger.debug(f"Invoking handler {handler.id!r}.") + logger.debug(f"{handler} is invoked.") if handler.timeout is not None and state.runtime.total_seconds() >= handler.timeout: - raise HandlerTimeoutError(f"Handler {handler.id!r} has timed out after {state.runtime}.") + raise HandlerTimeoutError(f"{handler} has timed out after {state.runtime}.") if handler.retries is not None and state.retries >= handler.retries: - raise HandlerRetriesError(f"Handler {handler.id!r} has exceeded {state.retries} retries.") + raise HandlerRetriesError(f"{handler} has exceeded {state.retries} retries.") result = await invoke_handler( handler, @@ -261,12 +261,12 @@ async def execute_handler_once( # Unfinished children cause the regular retry, but with less logging and event reporting. except HandlerChildrenRetry as e: - logger.debug(f"Handler {handler.id!r} has unfinished sub-handlers. Will retry soon.") + logger.debug(f"{handler} has unfinished sub-handlers. Will retry soon.") return states.HandlerOutcome(final=False, exception=e, delay=e.delay) # Definitely a temporary error, regardless of the error strictness. except TemporaryError as e: - logger.error(f"Handler {handler.id!r} failed temporarily: %s", str(e) or repr(e)) + logger.error(f"{handler} failed temporarily: %s", str(e) or repr(e)) return states.HandlerOutcome(final=False, exception=e, delay=e.delay) # Same as permanent errors below, but with better logging for our internal cases. @@ -277,20 +277,20 @@ async def execute_handler_once( # Definitely a permanent error, regardless of the error strictness. except PermanentError as e: - logger.error(f"Handler {handler.id!r} failed permanently: %s", str(e) or repr(e)) + logger.error(f"{handler} failed permanently: %s", str(e) or repr(e)) return states.HandlerOutcome(final=True, exception=e) # TODO: report the handling failure somehow (beside logs/events). persistent status? # Regular errors behave as either temporary or permanent depending on the error strictness. except Exception as e: if errors_mode == handlers_.ErrorsMode.IGNORED: - logger.exception(f"Handler {handler.id!r} failed with an exception. Will ignore.") + logger.exception(f"{handler} failed with an exception. Will ignore.") return states.HandlerOutcome(final=True) elif errors_mode == handlers_.ErrorsMode.TEMPORARY: - logger.exception(f"Handler {handler.id!r} failed with an exception. Will retry.") + logger.exception(f"{handler} failed with an exception. 
Will retry.") return states.HandlerOutcome(final=False, exception=e, delay=backoff) elif errors_mode == handlers_.ErrorsMode.PERMANENT: - logger.exception(f"Handler {handler.id!r} failed with an exception. Will stop.") + logger.exception(f"{handler} failed with an exception. Will stop.") return states.HandlerOutcome(final=True, exception=e) # TODO: report the handling failure somehow (beside logs/events). persistent status? else: @@ -298,7 +298,7 @@ async def execute_handler_once( # No errors means the handler should be excluded from future runs in this reaction cycle. else: - logger.info(f"Handler {handler.id!r} succeeded.") + logger.info(f"{handler} succeeded.") return states.HandlerOutcome(final=True, result=result) diff --git a/kopf/structs/handlers.py b/kopf/structs/handlers.py index ff754f5c..fb314799 100644 --- a/kopf/structs/handlers.py +++ b/kopf/structs/handlers.py @@ -94,6 +94,10 @@ def __getattribute__(self, name: str) -> Any: else: return super().__getattribute__(name) + # Used in the logs. Overridden in some (but not all) handler types for better log messages. + def __str__(self) -> str: + return f"Handler {self.id!r}" + @dataclasses.dataclass class ActivityHandler(BaseHandler): @@ -101,6 +105,9 @@ class ActivityHandler(BaseHandler): activity: Optional[Activity] _fallback: bool = False # non-public! + def __str__(self) -> str: + return f"Activity {self.id!r}" + @dataclasses.dataclass class ResourceHandler(BaseHandler): diff --git a/tests/e2e/test_examples.py b/tests/e2e/test_examples.py index fa10885b..36715781 100644 --- a/tests/e2e/test_examples.py +++ b/tests/e2e/test_examples.py @@ -92,7 +92,7 @@ def test_all_examples_are_runnable(mocker, settings, with_crd, exampledir, caplo assert 'Traceback (most recent call last):' not in runner.stdout # Verify that once a handler succeeds, it is never re-executed again. - handler_names = re.findall(r"Handler '(.+?)' succeeded", runner.stdout) + handler_names = re.findall(r"'(.+?)' succeeded", runner.stdout) if e2e_success_counts is not None: checked_names = [name for name in handler_names if name in e2e_success_counts] name_counts = collections.Counter(checked_names) @@ -102,7 +102,7 @@ def test_all_examples_are_runnable(mocker, settings, with_crd, exampledir, caplo assert set(name_counts.values()) == {1} # Verify that once a handler fails, it is never re-executed again. - handler_names = re.findall(r"Handler '(.+?)' failed (?:permanently|with an exception. Will stop.)", runner.stdout) + handler_names = re.findall(r"'(.+?)' failed (?:permanently|with an exception. 
Will stop.)", runner.stdout) if e2e_failure_counts is not None: checked_names = [name for name in handler_names if name in e2e_failure_counts] name_counts = collections.Counter(checked_names) diff --git a/tests/handling/test_cause_handling.py b/tests/handling/test_cause_handling.py index 15444e47..b759d918 100644 --- a/tests/handling/test_cause_handling.py +++ b/tests/handling/test_cause_handling.py @@ -49,7 +49,7 @@ async def test_create(registry, settings, handlers, resource, cause_mock, event_ assert_logs([ "Creation event:", - "Invoking handler 'create_fn'", + "Handler 'create_fn' is invoked", "Handler 'create_fn' succeeded", "All handlers succeeded", "Patching with", @@ -91,7 +91,7 @@ async def test_update(registry, settings, handlers, resource, cause_mock, event_ assert_logs([ "Update event:", - "Invoking handler 'update_fn'", + "Handler 'update_fn' is invoked", "Handler 'update_fn' succeeded", "All handlers succeeded", "Patching with", @@ -131,7 +131,7 @@ async def test_delete(registry, settings, handlers, resource, cause_mock, event_ assert_logs([ "Deletion event", - "Invoking handler 'delete_fn'", + "Handler 'delete_fn' is invoked", "Handler 'delete_fn' succeeded", "All handlers succeeded", "Removing the finalizer", diff --git a/tests/handling/test_delays.py b/tests/handling/test_delays.py index dc9fda22..bc95fca2 100644 --- a/tests/handling/test_delays.py +++ b/tests/handling/test_delays.py @@ -56,7 +56,7 @@ async def test_delayed_handlers_progress( assert patch['status']['kopf']['progress'][fname]['delayed'] == delayed_iso assert_logs([ - "Invoking handler .+", + "Handler .+ is invoked", "Handler .+ failed temporarily: oops", ]) diff --git a/tests/handling/test_event_handling.py b/tests/handling/test_event_handling.py index bebbe855..0b3f0a25 100644 --- a/tests/handling/test_event_handling.py +++ b/tests/handling/test_event_handling.py @@ -36,9 +36,9 @@ async def test_handlers_called_always( assert event['type'] == 'ev-type' assert_logs([ - "Invoking handler 'event_fn'.", + "Handler 'event_fn' is invoked.", "Handler 'event_fn' succeeded.", - "Invoking handler 'event_fn2'.", + "Handler 'event_fn2' is invoked.", "Handler 'event_fn2' succeeded.", ]) @@ -66,8 +66,8 @@ async def test_errors_are_ignored( assert extrahandlers.event_mock.called assert_logs([ - "Invoking handler 'event_fn'.", + "Handler 'event_fn' is invoked.", "Handler 'event_fn' failed with an exception. 
Will ignore.", - "Invoking handler 'event_fn2'.", + "Handler 'event_fn2' is invoked.", "Handler 'event_fn2' succeeded.", ]) From 99a32d42de70979d494b075fbae9b5cfa11e6b80 Mon Sep 17 00:00:00 2001 From: Sergey Vasilyev Date: Sun, 8 Mar 2020 22:02:40 +0100 Subject: [PATCH 09/20] Add @kopf.daemon & @kopf.timer for background tasks --- kopf/__init__.py | 10 +- kopf/engines/sleeping.py | 9 +- kopf/on.py | 78 +++++++ kopf/reactor/causation.py | 43 ++++ kopf/reactor/daemons.py | 404 +++++++++++++++++++++++++++++++++++++ kopf/reactor/handling.py | 20 +- kopf/reactor/invocation.py | 10 +- kopf/reactor/processing.py | 101 ++++++++-- kopf/reactor/registries.py | 19 ++ kopf/reactor/running.py | 10 + kopf/structs/callbacks.py | 57 ++++++ kopf/structs/containers.py | 52 ++++- kopf/structs/handlers.py | 28 +++ kopf/structs/primitives.py | 131 +++++++++++- 14 files changed, 941 insertions(+), 31 deletions(-) create mode 100644 kopf/reactor/daemons.py diff --git a/kopf/__init__.py b/kopf/__init__.py index 358e670c..6dcbd60e 100644 --- a/kopf/__init__.py +++ b/kopf/__init__.py @@ -26,6 +26,10 @@ warn, exception, ) +from kopf.on import ( + daemon, + timer, +) from kopf.on import ( register, ) @@ -91,6 +95,9 @@ from kopf.structs.handlers import ( ErrorsMode, ) +from kopf.structs.primitives import ( + DaemonStoppingReason, +) from kopf.toolkits.hierarchies import ( adopt, label, @@ -113,7 +120,7 @@ HandlerRetryError = TemporaryError # a backward-compatibility alias __all__ = [ - 'on', 'lifecycles', 'register', 'execute', + 'on', 'lifecycles', 'register', 'execute', 'daemon', 'timer', 'configure', 'login', 'LoginError', 'ConnectionInfo', 'login_via_pykube', 'login_via_client', @@ -149,4 +156,5 @@ 'StatusProgressStorage', 'MultiProgressStorage', 'SmartProgressStorage', + 'DaemonStoppingReason', ] diff --git a/kopf/engines/sleeping.py b/kopf/engines/sleeping.py index 5996c4e5..40d726ff 100644 --- a/kopf/engines/sleeping.py +++ b/kopf/engines/sleeping.py @@ -5,10 +5,12 @@ import collections from typing import Optional, Collection, Union +from kopf.structs import primitives + async def sleep_or_wait( delays: Union[None, float, Collection[Union[None, float]]], - event: Optional[asyncio.Event] = None, + wakeup: Optional[Union[asyncio.Event, primitives.DaemonStopper]] = None, ) -> Optional[float]: """ Measure the sleep time: either until the timeout, or until the event is set. 
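The relaxed signature accepts a single delay, a collection of optional delays, or ``None``,
and the wake-up source may now be a ``DaemonStopper`` as well as a plain ``asyncio.Event``.
A minimal usage sketch (the ``demo`` coroutine is made up; the call patterns mirror those
exercised in ``tests/test_sleeping.py``)::

    import asyncio

    from kopf.engines.sleeping import sleep_or_wait

    async def demo():
        # The smallest of the real delays wins; None entries are ignored.
        await sleep_or_wait([30.0, None, 5.0])       # sleeps ~5 seconds, returns None

        # An early wake-up returns the unslept remainder instead of None.
        wakeup = asyncio.Event()
        asyncio.get_running_loop().call_later(1.0, wakeup.set)
        unslept = await sleep_or_wait(10.0, wakeup)  # ~9.0
        print(unslept)

    asyncio.run(demo())
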
@@ -22,7 +24,10 @@ async def sleep_or_wait( actual_delays = [delay for delay in passed_delays if delay is not None] minimal_delay = min(actual_delays) if actual_delays else 0 - awakening_event = event if event is not None else asyncio.Event() + awakening_event = ( + wakeup.async_event if isinstance(wakeup, primitives.DaemonStopper) else + wakeup if wakeup is not None else + asyncio.Event()) loop = asyncio.get_running_loop() try: diff --git a/kopf/on.py b/kopf/on.py index 41176dee..0d9fe8ca 100644 --- a/kopf/on.py +++ b/kopf/on.py @@ -27,6 +27,8 @@ def creation_handler(**kwargs): ActivityDecorator = Callable[[callbacks.ActivityFn], callbacks.ActivityFn] ResourceWatchingDecorator = Callable[[callbacks.ResourceWatchingFn], callbacks.ResourceWatchingFn] ResourceChangingDecorator = Callable[[callbacks.ResourceChangingFn], callbacks.ResourceChangingFn] +ResourceDaemonDecorator = Callable[[callbacks.ResourceDaemonFn], callbacks.ResourceDaemonFn] +ResourceTimerDecorator = Callable[[callbacks.ResourceTimerFn], callbacks.ResourceTimerFn] def startup( # lgtm[py/similar-function] @@ -322,6 +324,82 @@ def decorator(fn: callbacks.ResourceWatchingFn) -> callbacks.ResourceWatchingFn: return decorator +def daemon( # lgtm[py/similar-function] + group: str, version: str, plural: str, + *, + id: Optional[str] = None, + errors: Optional[handlers.ErrorsMode] = None, + timeout: Optional[float] = None, + retries: Optional[int] = None, + backoff: Optional[float] = None, + cooldown: Optional[float] = None, # deprecated, use `backoff` + registry: Optional[registries.OperatorRegistry] = None, + labels: Optional[filters.MetaFilter] = None, + annotations: Optional[filters.MetaFilter] = None, + when: Optional[callbacks.WhenFilterFn] = None, + initial_backoff: Optional[float] = None, + cancellation_backoff: Optional[float] = None, + cancellation_timeout: Optional[float] = None, + cancellation_polling: Optional[float] = None, +) -> ResourceDaemonDecorator: + """ ``@kopf.daemon()`` decorator for the background threads/tasks. """ + def decorator(fn: callbacks.ResourceDaemonFn) -> callbacks.ResourceDaemonFn: + _warn_deprecated_signatures(fn) + _warn_deprecated_filters(labels, annotations) + real_registry = registry if registry is not None else registries.get_default_registry() + real_resource = resources.Resource(group, version, plural) + real_id = registries.generate_id(fn=fn, id=id) + handler = handlers.ResourceDaemonHandler( + fn=fn, id=real_id, + errors=errors, timeout=timeout, retries=retries, backoff=backoff, cooldown=cooldown, + labels=labels, annotations=annotations, when=when, + initial_backoff=initial_backoff, requires_finalizer=True, #TODO: requires_finalizer? "optional"? 
+ cancellation_backoff=cancellation_backoff, + cancellation_timeout=cancellation_timeout, + cancellation_polling=cancellation_polling, + ) + real_registry.resource_spawning_handlers[real_resource].append(handler) + return fn + return decorator + + +def timer( # lgtm[py/similar-function] + group: str, version: str, plural: str, + *, + id: Optional[str] = None, + errors: Optional[handlers.ErrorsMode] = None, + timeout: Optional[float] = None, + retries: Optional[int] = None, + backoff: Optional[float] = None, + cooldown: Optional[float] = None, # deprecated, use `backoff` + registry: Optional[registries.OperatorRegistry] = None, + labels: Optional[filters.MetaFilter] = None, + annotations: Optional[filters.MetaFilter] = None, + when: Optional[callbacks.WhenFilterFn] = None, + initial_backoff: Optional[float] = None, + sharp: Optional[bool] = None, + idle: Optional[float] = None, + interval: Optional[float] = None, +) -> ResourceTimerDecorator: + """ ``@kopf.timer()`` handler for the regular events. """ + def decorator(fn: callbacks.ResourceTimerFn) -> callbacks.ResourceTimerFn: + _warn_deprecated_signatures(fn) + _warn_deprecated_filters(labels, annotations) + real_registry = registry if registry is not None else registries.get_default_registry() + real_resource = resources.Resource(group, version, plural) + real_id = registries.generate_id(fn=fn, id=id) + handler = handlers.ResourceTimerHandler( + fn=fn, id=real_id, + errors=errors, timeout=timeout, retries=retries, backoff=backoff, cooldown=cooldown, + labels=labels, annotations=annotations, when=when, + initial_backoff=initial_backoff, requires_finalizer=None, + sharp=sharp, idle=idle, interval=interval, + ) + real_registry.resource_spawning_handlers[real_resource].append(handler) + return fn + return decorator + + # TODO: find a better name: `@kopf.on.this` is confusing and does not fully # TODO: match with the `@kopf.on.{cause}` pattern, where cause is create/update/delete. def this( # lgtm[py/similar-function] diff --git a/kopf/reactor/causation.py b/kopf/reactor/causation.py index a02bc586..8e6a9f69 100644 --- a/kopf/reactor/causation.py +++ b/kopf/reactor/causation.py @@ -31,6 +31,7 @@ from kopf.structs import diffs from kopf.structs import handlers from kopf.structs import patches +from kopf.structs import primitives from kopf.structs import resources @@ -64,6 +65,17 @@ class ResourceWatchingCause(ResourceCause): raw: bodies.RawEvent +@dataclasses.dataclass +class ResourceSpawningCause(ResourceCause): + """ + An internal daemon is spawning: tasks, threads, timers. + + Used only on the first appearance of a resource as a container for resource- + specific objects (loggers, etc). + """ + reset: bool + + @dataclasses.dataclass class ResourceChangingCause(ResourceCause): """ @@ -89,6 +101,28 @@ def deleted(self) -> bool: return finalizers.is_deletion_ongoing(self.body) +@dataclasses.dataclass +class DaemonCause(ResourceCause): + """ + An exceptional case of a container for daemon invocation kwargs. + + Regular causes are usually short-term, triggered by a watch-stream event, + and disappear once the event is processed. The processing includes + daemon spawning: the original cause and its temporary watch-event + should not be remembered though the whole life cycle of a daemon. + + Instead, a new artificial daemon-cause is used (this class), which + passes the kwarg values to the invocation routines. 
It only contains + the long-living kwargs: loggers, per-daemon stoppers, body-views + (with only the latest bodies as contained values), etc. + + Unlike other causes, it is created not in the processing routines once + per event, but in the daemon spawning routines once per daemon (or a timer). + Therefore, it is not "detected", but is created directly as an instance. + """ + stopper: primitives.DaemonStopper # a signaller for the termination and its reason. + + def detect_resource_watching_cause( raw_event: bodies.RawEvent, body: bodies.Body, @@ -101,6 +135,15 @@ def detect_resource_watching_cause( **kwargs) +def detect_resource_spawning_cause( + body: bodies.Body, + **kwargs: Any, +) -> ResourceSpawningCause: + return ResourceSpawningCause( + body=body, + **kwargs) + + def detect_resource_changing_cause( *, raw_event: bodies.RawEvent, diff --git a/kopf/reactor/daemons.py b/kopf/reactor/daemons.py new file mode 100644 index 00000000..bf787e9b --- /dev/null +++ b/kopf/reactor/daemons.py @@ -0,0 +1,404 @@ +""" +Daemons are background tasks accompanying the individual resource objects. + +Every ``@kopf.daemon`` and ``@kopf.timer`` handler produces a separate +asyncio task to either directly execute the daemon, or to trigger one-shot +handlers by schedule. The wrapping tasks are always async; the sync functions +are called in thread executors as part of a regular handler invocation. + +These tasks are remembered in the per-resources *memories* (arbitrary data +containers) through the life-cycle of the operator. + +Since the operators are event-driven conceptually, there are no background tasks +running for every individual resources normally (i.e. without the daemons), +so there are no connectors between the operator's root tasks and the daemons, +so there is no way to stop/kill/cancel the daemons when the operator exits. + +For this, there is an artificial root task spawned to kill all the daemons +when the operator exits, and all root tasks are gracefully/forcedly terminated. +Otherwise, all the daemons would be considered as "hung" tasks and will be +force-killed after some timeout -- which can be avoided, since we are aware +of the daemons, and they are not actually "hung". +""" +import asyncio +import time +import warnings +from typing import MutableMapping, Sequence, Collection, List + +from kopf.clients import patching +from kopf.engines import logging as logging_engine +from kopf.engines import sleeping +from kopf.reactor import causation +from kopf.reactor import handling +from kopf.reactor import lifecycles +from kopf.storage import states +from kopf.structs import configuration +from kopf.structs import containers +from kopf.structs import handlers as handlers_ +from kopf.structs import patches +from kopf.structs import primitives + +DAEMON_POLLING_INTERVAL = 60 + + +async def spawn_resource_daemons( + *, + settings: configuration.OperatorSettings, + handlers: Sequence[handlers_.ResourceSpawningHandler], + daemons: MutableMapping[containers.DaemonId, containers.Daemon], + cause: causation.ResourceSpawningCause, + memory: containers.ResourceMemory, +) -> Collection[float]: + """ + Ensure that all daemons are spawned for this individual resource. + + This function can be called multiple times on multiple handling cycles + (though usually should be called on the first-seen occasion), so it must + be idempotent: not having duplicating side-effects on multiple calls. + """ + if memory.live_fresh_body is None: # for type-checking; "not None" is ensured in processing. 
+ raise RuntimeError("A daemon is spawned with None as body. This is a bug. Please repor.") + for handler in handlers: + daemon_id = containers.DaemonId(handler.id) + if daemon_id not in daemons: + stopper = primitives.DaemonStopper() + daemon_cause = causation.DaemonCause( + resource=cause.resource, + logger=cause.logger, + body=memory.live_fresh_body, + memo=memory.user_data, + patch=patches.Patch(), # not the same as the one-shot spawning patch! + stopper=stopper, # for checking (passed to kwargs) + ) + daemon = containers.Daemon( + stopper=stopper, # for stopping (outside of causes) + handler=handler, + logger=logging_engine.LocalObjectLogger(body=cause.body, settings=settings), + task=asyncio.create_task(_runner( + settings=settings, + handler=handler, + cause=daemon_cause, + memory=memory, + )), + ) + daemons[daemon_id] = daemon + return [] + + +async def stop_resource_daemons( + *, + settings: configuration.OperatorSettings, + daemons: MutableMapping[containers.DaemonId, containers.Daemon], +) -> Collection[float]: + """ + Terminate all daemons of an individual resource (gracefully and by force). + + All daemons are terminated in parallel to speed up the termination + (especially taking into account that some daemons can take time to finish). + + The daemons are asked to terminate as soon as the object is marked + for deletion. It can take some time until the deletion handlers also + finish their work. The object is not physically deleted until all + the daemons are terminated (by putting a finalizer on it). + """ + delays: List[float] = [] + now = time.monotonic() + for daemon_id, daemon in daemons.items(): + logger = daemon.logger + stopper = daemon.stopper + age = (now - (stopper.when or now)) + + if isinstance(daemon.handler, handlers_.ResourceDaemonHandler): + backoff = daemon.handler.cancellation_backoff + timeout = daemon.handler.cancellation_timeout + polling = daemon.handler.cancellation_polling or DAEMON_POLLING_INTERVAL + elif isinstance(daemon.handler, handlers_.ResourceTimerHandler): + backoff = None + timeout = None + polling = DAEMON_POLLING_INTERVAL + else: + raise RuntimeError(f"Unsupported daemon handler: {daemon.handler!r}") + + # Whatever happens with other flags & logs & timings, this flag must be surely set. + stopper.set(reason=primitives.DaemonStoppingReason.RESOURCE_DELETED) + + # It might be so, that the daemon exits instantly (if written properly). Avoid patching and + # unnecessary handling cycles in this case: just give the asyncio event loop an extra cycle. + await asyncio.sleep(0) + + # Try different approaches to exiting the daemon based on timings. 
+ if daemon.task.done(): + pass + + elif backoff is not None and age < backoff: + if not stopper.is_set(reason=primitives.DaemonStoppingReason.DAEMON_SIGNALLED): + stopper.set(reason=primitives.DaemonStoppingReason.DAEMON_SIGNALLED) + logger.debug(f"Daemon {daemon_id!r} is signalled to exit gracefully.") + delays.append(backoff - age) + + elif timeout is not None and age < timeout + (backoff or 0): + if not stopper.is_set(reason=primitives.DaemonStoppingReason.DAEMON_CANCELLED): + stopper.set(reason=primitives.DaemonStoppingReason.DAEMON_CANCELLED) + logger.debug(f"Daemon {daemon_id!r} is signalled to exit by force.") + daemon.task.cancel() + delays.append(timeout + (backoff or 0) - age) + + elif timeout is not None: + if not stopper.is_set(reason=primitives.DaemonStoppingReason.DAEMON_ABANDONED): + stopper.set(reason=primitives.DaemonStoppingReason.DAEMON_ABANDONED) + logger.warning(f"Daemon {daemon_id!r} did not exit in time. Leaving it orphaned.") + warnings.warn(f"Daemon {daemon_id!r} did not exit in time.", ResourceWarning) + + else: + logger.debug(f"Daemon {daemon_id!r} is still exiting. Next check is in {polling}s.") + delays.append(polling) + + return delays + + +async def daemon_killer( + *, + settings: configuration.OperatorSettings, + memories: containers.ResourceMemories, +) -> None: + """ + An operator's root task to kill the daemons on the operator's shutdown. + """ + + # Sleep forever, or until cancelled, which happens when the operator begins its shutdown. + try: + await asyncio.Event().wait() + except asyncio.CancelledError: + pass + + # Terminate all running daemons when the operator exits (and this task is cancelled). + coros = [ + stop_daemon(daemon_id=daemon_id, daemon=daemon) + for memory in memories.iter_all_memories() + for daemon_id, daemon in memory.daemons.items() + ] + if coros: + await asyncio.wait(coros) + + +async def stop_daemon( + *, + daemon_id: containers.DaemonId, + daemon: containers.Daemon, +) -> None: + """ + Stop a single daemon. + + The purpose is the same as in `stop_resource_daemons`, but this function + is called on operator exiting, so there is no multi-step handling, + everything happens in memory and linearly (while respecting the timing). + """ + if isinstance(daemon.handler, handlers_.ResourceDaemonHandler): + backoff = daemon.handler.cancellation_backoff + timeout = daemon.handler.cancellation_timeout + elif isinstance(daemon.handler, handlers_.ResourceTimerHandler): + backoff = None + timeout = None + else: + raise RuntimeError(f"Unsupported daemon handler: {daemon.handler!r}") + + # Whatever happens with other flags & logs & timings, this flag must be surely set. + daemon.stopper.set(reason=primitives.DaemonStoppingReason.OPERATOR_EXITING) + + # It might be so, that the daemon exits instantly (if written properly). + # Avoid waiting and cancelling below: just give the asyncio event loop an extra cycle. + await asyncio.sleep(0) + + # Try different approaches to exiting the daemon based on timings. 
+ if not daemon.task.done() and backoff is not None: + daemon.stopper.set(reason=primitives.DaemonStoppingReason.DAEMON_SIGNALLED) + daemon.logger.debug(f"Daemon {daemon_id!r} is signalled to exit gracefully.") + await asyncio.wait([daemon.task], timeout=backoff) + + if not daemon.task.done() and timeout is not None: + daemon.stopper.set(reason=primitives.DaemonStoppingReason.DAEMON_CANCELLED) + daemon.logger.debug(f"Daemon {daemon_id!r} is signalled to exit by force.") + daemon.task.cancel() + await asyncio.wait([daemon.task], timeout=timeout) + + if not daemon.task.done(): + daemon.stopper.set(reason=primitives.DaemonStoppingReason.DAEMON_ABANDONED) + daemon.logger.warning(f"Daemon {daemon_id!r} did not exit in time. Leaving it orphaned.") + warnings.warn(f"Daemon {daemon_id!r} did not exit in time.", ResourceWarning) + + +async def _runner( + *, + settings: configuration.OperatorSettings, + handler: handlers_.ResourceSpawningHandler, + memory: containers.ResourceMemory, + cause: causation.DaemonCause, +) -> None: + """ + Guard a running daemon during its life cycle. + + Note: synchronous daemons are awaited to the exit and postpone cancellation. + The runner will not exit until the thread exits. See `invoke` for details. + """ + try: + if isinstance(handler, handlers_.ResourceDaemonHandler): + await _resource_daemon(settings=settings, handler=handler, cause=cause) + elif isinstance(handler, handlers_.ResourceTimerHandler): + await _resource_timer(settings=settings, handler=handler, cause=cause, memory=memory) + else: + raise RuntimeError("Cannot determine which task wrapper to use. This is a bug.") + + finally: + # Whatever happened, make sure the sync threads of asyncio threaded executor are notified: + # in a hope that they will exit maybe some time later to free the OS/asyncio resources. + # A possible case: operator is exiting and cancelling all "hung" non-root tasks, etc. + cause.stopper.set(reason=primitives.DaemonStoppingReason.DONE) + + +async def _resource_daemon( + *, + settings: configuration.OperatorSettings, + handler: handlers_.ResourceDaemonHandler, + cause: causation.DaemonCause, +) -> None: + """ + A long-running guarding task for a resource daemon handler. + + The handler is executed either once or repeatedly, based on the handler + declaration. + + Few kinds of errors are suppressed, those expected from the daemons when + they are cancelled due to the resource deletion. + """ + logger = cause.logger + + if handler.initial_backoff is not None: + await sleeping.sleep_or_wait(handler.initial_backoff, cause.stopper) + + # Similar to activities (in-memory execution), but applies patches on every attempt. + state = states.State.from_scratch(handlers=[handler]) + while not cause.stopper.is_set() and not state.done: + + outcomes = await handling.execute_handlers_once( + lifecycle=lifecycles.all_at_once, # there is only one anyway + settings=settings, + handlers=[handler], + cause=cause, + state=state, + ) + state = state.with_outcomes(outcomes) + states.deliver_results(outcomes=outcomes, patch=cause.patch) + + if cause.patch: + cause.logger.debug("Patching with: %r", cause.patch) + await patching.patch_obj(resource=cause.resource, patch=cause.patch, body=cause.body) + cause.patch.clear() + + # The in-memory sleep does not react to resource changes, but only to stopping. 
+ await sleeping.sleep_or_wait(state.delay, cause.stopper) + + if cause.stopper.is_set(): + logger.debug(f"{handler} has exited on request and will not be retried or restarted.") + else: + logger.debug(f"{handler} has exited on its own and will not be retried or restarted.") + + +async def _resource_timer( + *, + settings: configuration.OperatorSettings, + handler: handlers_.ResourceTimerHandler, + memory: containers.ResourceMemory, + cause: causation.DaemonCause, +) -> None: + """ + A long-running guarding task for resource timer handlers. + + Each individual handler for each individual k8s-object gets its own task. + Despite asyncio can schedule the delayed execution of the callbacks + with ``loop.call_later()`` and ``loop.call_at()``, we do not use them: + + * First, the callbacks are synchronous, making it impossible to patch + the k8s-objects with the returned results of the handlers. + + * Second, our timers are more sophisticated: they track the last-seen time, + obey the idle delays, and are instantly terminated/cancelled on the object + deletion or on the operator exit. + + * Third, sharp timing would require an external timestamp storage anyway, + which is easier to keep as a local variable inside of a function. + + It is hard to implement all of this with native asyncio timers. + It is much easier to have an extra task which mostly sleeps, + but calls the handling functions from time to time. + """ + + if handler.initial_backoff is not None: + await sleeping.sleep_or_wait(handler.initial_backoff, cause.stopper) + + # Similar to activities (in-memory execution), but applies patches on every attempt. + state = states.State.from_scratch(handlers=[handler]) + while not cause.stopper.is_set(): # NB: ignore state.done! it is checked below explicitly. + + # Reset success/failure retry counters & timers if it has succeeded. Keep it if failed. + # Every next invocation of a successful handler starts the retries from scratch (from zero). + if state.done: + state = states.State.from_scratch(handlers=[handler]) + + # Both `now` and `last_seen_time` are moving targets: the last seen time is updated + # on every watch-event received, and prolongs the sleep. The sleep is never shortened. + if handler.idle is not None: + while not cause.stopper.is_set() and time.monotonic() - memory.idle_reset_time < handler.idle: + delay = memory.idle_reset_time + handler.idle - time.monotonic() + await sleeping.sleep_or_wait(delay, cause.stopper) + if cause.stopper.is_set(): + continue + + # Remember the start time for the sharp timing and idle-time-waster below. + started = time.monotonic() + + # Execute the handler as usually, in-memory, but handle its outcome on every attempt. + outcomes = await handling.execute_handlers_once( + lifecycle=lifecycles.all_at_once, # there is only one anyway + settings=settings, + handlers=[handler], + cause=cause, + state=state, + ) + state = state.with_outcomes(outcomes) + states.deliver_results(outcomes=outcomes, patch=cause.patch) + + # Apply the accumulated patches after every invocation attempt (regardless of its outcome). + if cause.patch: + cause.logger.debug("Patching with: %r", cause.patch) + await patching.patch_obj(resource=cause.resource, patch=cause.patch, body=cause.body) + cause.patch.clear() + + # For temporary errors, override the schedule by the one provided by errors themselves. + # It can be either a delay from TemporaryError, or a backoff for an arbitrary exception. 
+ if not state.done: + await sleeping.sleep_or_wait(state.delays, cause.stopper) + + # For sharp timers, calculate how much time is left to fit the interval grid: + # |-----|-----|-----|-----|-----|-----|---> (interval=5, sharp=True) + # [slow_handler]....[slow_handler]....[slow... + elif handler.interval is not None and handler.sharp: + passed_duration = time.monotonic() - started + remaining_delay = handler.interval - (passed_duration % handler.interval) + await sleeping.sleep_or_wait(remaining_delay, cause.stopper) + + # For regular (non-sharp) timers, simply sleep from last exit to the next call: + # |-----|-----|-----|-----|-----|-----|---> (interval=5, sharp=False) + # [slow_handler].....[slow_handler].....[slow... + elif handler.interval is not None: + await sleeping.sleep_or_wait(handler.interval, cause.stopper) + + # For idle-only no-interval timers, wait till the next change (i.e. idling reset). + # NB: This will skip the handler in the same tact (1/64th of a second) even if changed. + elif handler.idle is not None: + while memory.idle_reset_time <= started: + await sleeping.sleep_or_wait(handler.idle, cause.stopper) + + # Only in case there are no intervals and idling, treat it as a one-shot handler. + # This makes the handler practically meaningless, but technically possible. + else: + break diff --git a/kopf/reactor/handling.py b/kopf/reactor/handling.py index eb6588e2..cd5d5f75 100644 --- a/kopf/reactor/handling.py +++ b/kopf/reactor/handling.py @@ -7,6 +7,7 @@ The handler execution can also be used in other places, such as in-memory activities, when there is no underlying Kubernetes object to patch'n'watch. """ +import asyncio import collections.abc import logging from contextvars import ContextVar @@ -58,7 +59,7 @@ class HandlerChildrenRetry(TemporaryError): # The task-local context; propagated down the stack instead of multiple kwargs. # Used in `@kopf.on.this` and `kopf.execute()` to add/get the sub-handlers. -sublifecycle_var: ContextVar[lifecycles.LifeCycleFn] = ContextVar('sublifecycle_var') +sublifecycle_var: ContextVar[Optional[lifecycles.LifeCycleFn]] = ContextVar('sublifecycle_var') subregistry_var: ContextVar[registries.ResourceChangingRegistry] = ContextVar('subregistry_var') subsettings_var: ContextVar[configuration.OperatorSettings] = ContextVar('subsettings_var') subexecuted_var: ContextVar[bool] = ContextVar('subexecuted_var') @@ -92,6 +93,7 @@ async def execute( # Restore the current context as set in the handler execution cycle. lifecycle = lifecycle if lifecycle is not None else sublifecycle_var.get() + lifecycle = lifecycle if lifecycle is not None else lifecycles.get_default_lifecycle() cause = cause if cause is not None else cause_var.get() parent_handler: handlers_.BaseHandler = handler_var.get() parent_prefix = parent_handler.id if parent_handler is not None else None @@ -216,16 +218,21 @@ async def execute_handler_once( handler: handlers_.BaseHandler, cause: causation.BaseCause, state: states.HandlerState, - lifecycle: lifecycles.LifeCycleFn, + lifecycle: Optional[lifecycles.LifeCycleFn] = None, default_errors: handlers_.ErrorsMode = handlers_.ErrorsMode.TEMPORARY, ) -> states.HandlerOutcome: """ - Execute one and only one handler. + Execute one and only one handler for one and only one time. *Execution* means not just *calling* the handler in properly set context (see `_call_handler`), but also interpreting its result and errors, and wrapping them into am `HandlerOutcome` object -- to be stored in the state. 
+ The *execution* can be long -- depending on how the handler is implemented. + For daemons, it is normal to run for hours and days if needed. + This is different from the regular handlers, which are supposed + to be finished as soon as possible. + This method is not supposed to raise any exceptions from the handlers: exceptions mean the failure of execution itself. """ @@ -259,6 +266,11 @@ async def execute_handler_once( lifecycle=lifecycle, # just a default for the sub-handlers, not used directly. ) + # The cancellations are an excepted way of stopping the handler. Especially for daemons. + except asyncio.CancelledError: + logger.warning(f"{handler} is cancelled. Will escalate.") + raise + # Unfinished children cause the regular retry, but with less logging and event reporting. except HandlerChildrenRetry as e: logger.debug(f"{handler} has unfinished sub-handlers. Will retry soon.") @@ -307,7 +319,7 @@ async def invoke_handler( *args: Any, cause: causation.BaseCause, settings: configuration.OperatorSettings, - lifecycle: lifecycles.LifeCycleFn, + lifecycle: Optional[lifecycles.LifeCycleFn], **kwargs: Any, ) -> Optional[callbacks.Result]: """ diff --git a/kopf/reactor/invocation.py b/kopf/reactor/invocation.py index 49955600..cd1a2347 100644 --- a/kopf/reactor/invocation.py +++ b/kopf/reactor/invocation.py @@ -41,6 +41,7 @@ def context( def build_kwargs( cause: Optional[causation.BaseCause] = None, + _sync: Optional[bool] = None, **kwargs: Any ) -> Dict[str, Any]: """ @@ -90,7 +91,10 @@ def build_kwargs( old=cause.old, new=cause.new, ) - + if isinstance(cause, causation.DaemonCause) and _sync is not None: + new_kwargs.update( + stopped=cause.stopper.sync_checker if _sync else cause.stopper.async_checker, + ) return new_kwargs @@ -116,11 +120,11 @@ async def invoke( thus making it non-blocking for the main event loop of the operator. See: https://pymotw.com/3/asyncio/executors.html """ - kwargs = build_kwargs(cause=cause, **kwargs) - if is_async_fn(fn): + kwargs = build_kwargs(cause=cause, _sync=False, **kwargs) result = await fn(*args, **kwargs) # type: ignore else: + kwargs = build_kwargs(cause=cause, _sync=True, **kwargs) # Not that we want to use functools, but for executors kwargs, it is officially recommended: # https://docs.python.org/3/library/asyncio-eventloop.html#asyncio.loop.run_in_executor diff --git a/kopf/reactor/processing.py b/kopf/reactor/processing.py index cbbcd38d..a3303758 100644 --- a/kopf/reactor/processing.py +++ b/kopf/reactor/processing.py @@ -15,6 +15,7 @@ """ import asyncio import datetime +import time from typing import Collection, Optional from kopf.clients import patching @@ -22,6 +23,7 @@ from kopf.engines import posting from kopf.engines import sleeping from kopf.reactor import causation +from kopf.reactor import daemons from kopf.reactor import handling from kopf.reactor import lifecycles from kopf.reactor import registries @@ -59,26 +61,28 @@ async def process_resource_event( and therefore do not call the handling logic. """ + # Recall what is stored about that object. Share it in little portions with the consumers. + # And immediately forget it if the object is deleted from the cluster (but keep in memory). 
+ raw_type, raw_body = raw_event['type'], raw_event['object'] + memory = await memories.recall(raw_body, noticed_by_listing=raw_type is None) + if memory.live_fresh_body is not None: + memory.live_fresh_body._replace_with(raw_body) + if raw_type == 'DELETED': + await memories.forget(raw_body) + # Convert to a heavy mapping-view wrapper only now, when heavy processing begins. # Raw-event streaming, queueing, and batching use regular lightweight dicts. # Why here? 1. Before it splits into multiple causes & handlers for the same object's body; # 2. After it is batched (queueing); 3. While the "raw" parsed JSON is still known; # 4. Same as where a patch object of a similar wrapping semantics is created. - body = bodies.Body(raw_event['object']) + body = memory.live_fresh_body if memory.live_fresh_body is not None else bodies.Body(raw_body) patch = patches.Patch() - delay: Optional[float] = None # Each object has its own prefixed logger, to distinguish parallel handling. logger = logging_engine.ObjectLogger(body=body, settings=settings) posting.event_queue_loop_var.set(asyncio.get_running_loop()) posting.event_queue_var.set(event_queue) # till the end of this object's task. - # Recall what is stored about that object. Share it in little portions with the consumers. - # And immediately forget it if the object is deleted from the cluster (but keep in memory). - memory = await memories.recall(body, noticed_by_listing=raw_event['type'] is None) - if raw_event['type'] == 'DELETED': - await memories.forget(body) - extra_fields = registry.resource_changing_handlers[resource].get_extra_fields() old = settings.persistence.diffbase_storage.fetch(body=body) new = settings.persistence.diffbase_storage.build(body=body, extra_fields=extra_fields) @@ -96,6 +100,15 @@ async def process_resource_event( memo=memory.user_data, ) if registry.resource_watching_handlers[resource] else None + resource_spawning_cause = causation.detect_resource_spawning_cause( + resource=resource, + logger=logger, + patch=patch, + body=body, + memo=memory.user_data, + reset=bool(diff), # only essential changes reset idling, not every event + ) if registry.resource_spawning_handlers[resource] else None + resource_changing_cause = causation.detect_resource_changing_cause( raw_event=raw_event, resource=resource, @@ -116,6 +129,9 @@ async def process_resource_event( deletion_is_ongoing = finalizers.is_deletion_ongoing(body=body) deletion_is_blocked = finalizers.is_deletion_blocked(body=body) deletion_must_be_blocked = ( + (resource_spawning_cause is not None and + registry.resource_spawning_handlers[resource].requires_finalizer(resource_spawning_cause)) + or (resource_changing_cause is not None and registry.resource_changing_handlers[resource].requires_finalizer(resource_changing_cause))) @@ -130,6 +146,8 @@ async def process_resource_event( resource_changing_cause = None # prevent further high-level processing this time # Invoke all the handlers that should or could be invoked at this processing cycle. + # The low-level spies go ASAP always. However, the daemons are spawned before the high-level + # handlers and killed after them: the daemons should live throughout the full object lifecycle. if resource_watching_cause is not None: await process_resource_watching_cause( lifecycle=lifecycles.all_at_once, @@ -138,8 +156,15 @@ async def process_resource_event( cause=resource_watching_cause, ) - # Object patch accumulator. Populated by the methods. Applied in the end of the handler. 
- # Detect the cause and handle it (or at least log this happened). + resource_spawning_delays: Collection[float] = [] + if resource_spawning_cause is not None: + resource_spawning_delays = await process_resource_spawning_cause( + registry=registry, + settings=settings, + memory=memory, + cause=resource_spawning_cause, + ) + resource_changing_delays: Collection[float] = [] if resource_changing_cause is not None: resource_changing_delays = await process_resource_changing_cause( @@ -152,7 +177,9 @@ async def process_resource_event( # Release the object if everything is done, and it is marked for deletion. # But not when it has already gone. - if deletion_is_ongoing and deletion_is_blocked and not resource_changing_delays: + if deletion_is_ongoing and deletion_is_blocked \ + and not resource_spawning_delays \ + and not resource_changing_delays: logger.debug("Removing the finalizer, thus allowing the actual deletion.") finalizers.allow_deletion(body=body, patch=patch) @@ -162,7 +189,7 @@ async def process_resource_event( if raw_event['type'] != 'DELETED': await apply_reaction_outcomes( resource=resource, body=body, - patch=patch, delays=resource_changing_delays, + patch=patch, delays=list(resource_spawning_delays) + list(resource_changing_delays), logger=logger, replenished=replenished) @@ -239,6 +266,56 @@ async def process_resource_watching_cause( states.deliver_results(outcomes=outcomes, patch=cause.patch) +async def process_resource_spawning_cause( + registry: registries.OperatorRegistry, + settings: configuration.OperatorSettings, + memory: containers.ResourceMemory, + cause: causation.ResourceSpawningCause, +) -> Collection[float]: + """ + Spawn/kill all the background tasks of a resource. + + The spawning and killing happens in parallel with the resource-changing + handlers invocation (even if it takes few cycles). For this, the signal + to terminate is sent to the daemons immediately, but the actual check + of their shutdown is performed only when all the on-deletion handlers + are succeeded (or after they were invoked if they are optional; + or immediately if there were no on-deletion handlers to invoke at all). + + The resource remains blocked by the finalizers until all the daemons exit + (except those marked as tolerating being orphaned). + """ + + # Refresh the up-to-date body & essential timestamp for all the daemons/timers. + if memory.live_fresh_body is None: + memory.live_fresh_body = cause.body + if cause.reset: + memory.idle_reset_time = time.monotonic() + + if finalizers.is_deletion_ongoing(cause.body): + delays = await daemons.stop_resource_daemons( + settings=settings, + daemons=memory.daemons, + ) + return delays + + # Save a little bit of CPU on handlers selection if everything is spawned already. 
+ elif not memory.fully_spawned: + handlers = registry.resource_spawning_handlers[cause.resource].get_handlers(cause=cause) + delays = await daemons.spawn_resource_daemons( + settings=settings, + daemons=memory.daemons, + cause=cause, + memory=memory, + handlers=handlers, + ) + memory.fully_spawned = not delays + return delays + + else: + return [] + + async def process_resource_changing_cause( lifecycle: lifecycles.LifeCycleFn, registry: registries.OperatorRegistry, diff --git a/kopf/reactor/registries.py b/kopf/reactor/registries.py index 6d03df4d..fb8d7e50 100644 --- a/kopf/reactor/registries.py +++ b/kopf/reactor/registries.py @@ -35,6 +35,7 @@ HandlerFnT = TypeVar('HandlerFnT', callbacks.ActivityFn, callbacks.ResourceWatchingFn, + callbacks.ResourceSpawningFn, callbacks.ResourceChangingFn, Union[callbacks.ResourceWatchingFn, callbacks.ResourceChangingFn]) # DEPRECATED: for legacy_registries @@ -193,6 +194,21 @@ def iter_handlers( yield handler +class ResourceSpawningRegistry(ResourceRegistry[ + causation.ResourceSpawningCause, + callbacks.ResourceSpawningFn, + handlers.ResourceSpawningHandler]): + + @abc.abstractmethod + def iter_handlers( + self, + cause: causation.ResourceSpawningCause, + ) -> Iterator[handlers.ResourceSpawningHandler]: + for handler in self._handlers: + if match(handler=handler, cause=cause): + yield handler + + class ResourceChangingRegistry(ResourceRegistry[ causation.ResourceChangingCause, callbacks.ResourceChangingFn, @@ -261,18 +277,21 @@ class OperatorRegistry: """ activity_handlers: ActivityRegistry resource_watching_handlers: MutableMapping[resources_.Resource, ResourceWatchingRegistry] + resource_spawning_handlers: MutableMapping[resources_.Resource, ResourceSpawningRegistry] resource_changing_handlers: MutableMapping[resources_.Resource, ResourceChangingRegistry] def __init__(self) -> None: super().__init__() self.activity_handlers = ActivityRegistry() self.resource_watching_handlers = collections.defaultdict(ResourceWatchingRegistry) + self.resource_spawning_handlers = collections.defaultdict(ResourceSpawningRegistry) self.resource_changing_handlers = collections.defaultdict(ResourceChangingRegistry) @property def resources(self) -> FrozenSet[resources_.Resource]: """ All known resources in the registry. """ return (frozenset(self.resource_watching_handlers) | + frozenset(self.resource_spawning_handlers) | frozenset(self.resource_changing_handlers)) # diff --git a/kopf/reactor/running.py b/kopf/reactor/running.py index eea57c94..48dccd6c 100644 --- a/kopf/reactor/running.py +++ b/kopf/reactor/running.py @@ -12,6 +12,7 @@ from kopf.engines import posting from kopf.engines import probing from kopf.reactor import activities +from kopf.reactor import daemons from kopf.reactor import lifecycles from kopf.reactor import processing from kopf.reactor import queueing @@ -217,6 +218,15 @@ async def spawn_tasks( )), ]) + # Kill all the daemons gracefully when the operator exits (so that they are not "hung"). + tasks.extend([ + loop.create_task(_root_task_checker( + name="daemon killer", ready_flag=ready_flag, + coro=daemons.daemon_killer( + settings=settings, + memories=memories))), + ]) + # Keeping the credentials fresh and valid via the authentication handlers on demand. 
tasks.extend([ loop.create_task(_root_task_checker( diff --git a/kopf/structs/callbacks.py b/kopf/structs/callbacks.py index ff560b5d..1f1ecf3e 100644 --- a/kopf/structs/callbacks.py +++ b/kopf/structs/callbacks.py @@ -12,6 +12,7 @@ from kopf.structs import bodies from kopf.structs import diffs from kopf.structs import patches +from kopf.structs import primitives # A specialised type to highlight the purpose or origin of the data of type Any, # to not be mixed with other arbitrary Any values, where it is indeed "any". @@ -76,6 +77,62 @@ def __call__( # lgtm[py/similar-function] ) -> _SyncOrAsyncResult: ... +class ResourceDaemonSyncFn(Protocol): + def __call__( # lgtm[py/similar-function] # << different mode + self, + *args: Any, + body: bodies.Body, + meta: bodies.Meta, + spec: bodies.Spec, + status: bodies.Status, + uid: str, + name: str, + namespace: Optional[str], + logger: Union[logging.Logger, logging.LoggerAdapter], + stopped: primitives.SyncDaemonStopperChecker, # << different type + **kwargs: Any, + ) -> Optional[Result]: ... + + +class ResourceDaemonAsyncFn(Protocol): + async def __call__( # lgtm[py/similar-function] # << different mode + self, + *args: Any, + body: bodies.Body, + meta: bodies.Meta, + spec: bodies.Spec, + status: bodies.Status, + uid: str, + name: str, + namespace: Optional[str], + logger: Union[logging.Logger, logging.LoggerAdapter], + stopped: primitives.AsyncDaemonStopperChecker, # << different type + **kwargs: Any, + ) -> Optional[Result]: ... + + +ResourceDaemonFn = Union[ResourceDaemonSyncFn, ResourceDaemonAsyncFn] + + +class ResourceTimerFn(Protocol): + def __call__( # lgtm[py/similar-function] + self, + *args: Any, + body: bodies.Body, + meta: bodies.Meta, + spec: bodies.Spec, + status: bodies.Status, + uid: str, + name: str, + namespace: Optional[str], + logger: Union[logging.Logger, logging.LoggerAdapter], + **kwargs: Any, + ) -> _SyncOrAsyncResult: ... + + +ResourceSpawningFn = Union[ResourceDaemonFn, ResourceTimerFn] + + class WhenFilterFn(Protocol): def __call__( # lgtm[py/similar-function] self, diff --git a/kopf/structs/containers.py b/kopf/structs/containers.py index 6eebd5f2..6333b861 100644 --- a/kopf/structs/containers.py +++ b/kopf/structs/containers.py @@ -7,10 +7,33 @@ It is used internally to track allocated system resources for each Kubernetes object, even if that object does not show up in the event streams for long time. """ +import asyncio import dataclasses -from typing import MutableMapping, Dict, Any +import logging +import time +from typing import MutableMapping, Dict, Any, Iterator, Optional, Union, NewType, TYPE_CHECKING from kopf.structs import bodies +from kopf.structs import handlers +from kopf.structs import primitives + + +if TYPE_CHECKING: + asyncio_Task = asyncio.Task[None] + asyncio_Future = asyncio.Future[Any] +else: + asyncio_Task = asyncio.Task + asyncio_Future = asyncio.Future + +DaemonId = NewType('DaemonId', str) + + +@dataclasses.dataclass(frozen=True) +class Daemon: + task: asyncio_Task # a guarding task of the daemon. + logger: Union[logging.Logger, logging.LoggerAdapter] + handler: handlers.ResourceSpawningHandler + stopper: primitives.DaemonStopper # a signaller for the termination and its reason. class ObjectDict(Dict[Any, Any]): @@ -43,6 +66,12 @@ class ResourceMemory: noticed_by_listing: bool = False fully_handled_once: bool = False + # For background and timed threads/tasks (invoked with the kwargs of the last-seen body). 
+ live_fresh_body: Optional[bodies.Body] = None + idle_reset_time: float = dataclasses.field(default_factory=time.monotonic) + daemons: Dict[DaemonId, Daemon] = dataclasses.field(default_factory=dict) + fully_spawned: bool = False + class ResourceMemories: """ @@ -68,32 +97,41 @@ def __init__(self) -> None: super().__init__() self._items = {} + def iter_all_memories(self) -> Iterator[ResourceMemory]: + for memory in self._items.values(): + yield memory + async def recall( self, - body: bodies.Body, + raw_body: bodies.RawBody, *, noticed_by_listing: bool = False, ) -> ResourceMemory: """ Either find a resource's memory, or create and remember a new one. + + Keep the last-seen body up to date for all the handlers. """ - key = self._build_key(body) + key = self._build_key(raw_body) if key not in self._items: memory = ResourceMemory(noticed_by_listing=noticed_by_listing) self._items[key] = memory return self._items[key] - async def forget(self, body: bodies.Body) -> None: + async def forget( + self, + raw_body: bodies.RawBody, + ) -> None: """ Forget the resource's memory if it exists; or ignore if it does not. """ - key = self._build_key(body) + key = self._build_key(raw_body) if key in self._items: del self._items[key] def _build_key( self, - body: bodies.Body, + raw_body: bodies.RawBody, ) -> str: """ Construct an immutable persistent key of a resource. @@ -104,4 +142,4 @@ def _build_key( But it must be consistent within a single process lifetime. """ - return body.get('metadata', {}).get('uid') or '' + return raw_body.get('metadata', {}).get('uid') or '' diff --git a/kopf/structs/handlers.py b/kopf/structs/handlers.py index fb314799..a7a7d9b1 100644 --- a/kopf/structs/handlers.py +++ b/kopf/structs/handlers.py @@ -134,3 +134,31 @@ class ResourceChangingHandler(ResourceHandler): def event(self) -> Optional[Reason]: warnings.warn("handler.event is deprecated; use handler.reason.", DeprecationWarning) return self.reason + + +@dataclasses.dataclass +class ResourceSpawningHandler(ResourceHandler): + requires_finalizer: Optional[bool] + initial_backoff: Optional[float] + + +@dataclasses.dataclass +class ResourceDaemonHandler(ResourceSpawningHandler): + fn: callbacks.ResourceDaemonFn # type clarification + cancellation_backoff: Optional[float] # how long to wait before actual cancellation. + cancellation_timeout: Optional[float] # how long to wait before giving up on cancellation. + cancellation_polling: Optional[float] # how often to check for cancellation status. 
+ + def __str__(self) -> str: + return f"Daemon {self.id!r}" + + +@dataclasses.dataclass +class ResourceTimerHandler(ResourceSpawningHandler): + fn: callbacks.ResourceTimerFn # type clarification + sharp: Optional[bool] + idle: Optional[float] + interval: Optional[float] + + def __str__(self) -> str: + return f"Timer {self.id!r}" diff --git a/kopf/structs/primitives.py b/kopf/structs/primitives.py index 5fd66539..ae86a519 100644 --- a/kopf/structs/primitives.py +++ b/kopf/structs/primitives.py @@ -3,10 +3,18 @@ """ import asyncio import concurrent.futures +import enum import threading -from typing import Optional, Union, Any +import time +from typing import Optional, Union, Any, TYPE_CHECKING -Flag = Union[asyncio.Future, asyncio.Event, concurrent.futures.Future, threading.Event] + +if TYPE_CHECKING: + asyncio_Future = asyncio.Future[Any] +else: + asyncio_Future = asyncio.Future + +Flag = Union[asyncio_Future, asyncio.Event, concurrent.futures.Future, threading.Event] async def wait_flag( @@ -113,3 +121,122 @@ async def wait_for_off(self) -> None: """ Wait until the toggle is turned off (if not yet). """ async with self._condition: await self._condition.wait_for(lambda: not self._state) + + +class DaemonStoppingReason(enum.Flag): + """ + A reason or reasons of daemon being terminated. + + Daemons are signalled to exit usually for two reasons: the operator itself + is exiting or restarting, so all daemons of all resources must stop; + or the individual resource was deleted, but the operator continues running. + + No matter the reason, the daemons must exit, so one and only one stop-flag + is used. Some daemons can check the reason of exiting if it is important. + + There can be multiple reasons combined (in rare cases, all of them). + """ + NONE = 0 + DONE = enum.auto() # whatever the reason and the status, the asyncio task has exited. + RESOURCE_DELETED = enum.auto() # the resource was deleted, the asyncio task is still awaited. + OPERATOR_EXITING = enum.auto() # the operator is exiting, the asyncio task is still awaited. + DAEMON_SIGNALLED = enum.auto() # the stopper flag was set, the asyncio task is still awaited. + DAEMON_CANCELLED = enum.auto() # the asyncio task was cancelled, the thread can be running. + DAEMON_ABANDONED = enum.auto() # we gave up on the asyncio task, the thread can be running. + + +class DaemonStopper: + """ + A boolean flag indicating that the daemon should stop and exit. + + Every daemon gets a ``stopper`` kwarg, which is an event-like object. + The stopper is raised in two cases: + + * The corresponding k8s object is deleted, so the daemon should stop. + * The whole operator is stopping, so all the daemons should stop too. + + The stopper flag is a graceful way of a daemon termination. + If the daemons do not react to their stoppers, and continue running, + their tasks are cancelled by raising a `asyncio.CancelledError`. + + .. warning:: + In case of synchronous handlers, which are executed in the threads, + this can lead to the OS resource leakage: + there is no way to kill a thread in Python, so it will continue running + forever or until failed (e.g. on an API call for an absent resource). + The orphan threads will block the operator's process from exiting, + thus affecting the speed of restarts. 
+ """ + + def __init__(self) -> None: + super().__init__() + self.when: Optional[float] = None + self.reason = DaemonStoppingReason.NONE + self.sync_checker = SyncDaemonStopperChecker(self) + self.async_checker = AsyncDaemonStopperChecker(self) + self.sync_event = threading.Event() + self.async_event = asyncio.Event() + + def __repr__(self) -> str: + return f'<{self.__class__.__name__}: {self.is_set()}, reason={self.reason}>' + + def is_set(self, reason: Optional[DaemonStoppingReason] = None) -> bool: + """ + Check if the daemon stopper is set: at all or for specific reason. + """ + return ((reason is None or reason in self.reason) and self.sync_event.is_set()) + + def set(self, *, reason: DaemonStoppingReason) -> None: + self.when = self.when if self.when is not None else time.monotonic() + self.reason |= reason + self.sync_event.set() + self.async_event.set() # it is thread-safe: always called in operator's event loop. + + +class DaemonStopperChecker: + + """ + A minimalistic read-only checker for the daemons from the user side. + + This object is fed into the :kwarg:`stopped` kwarg for the handlers. + + The actual stopper is hidden from the users, and is an internal class. + The users should not be able to trigger the stopping activities or + check the reasons of stopping (or know about them at all). + + Usage:: + + @kopf.daemon('zalando.org', 'v1', 'kopfexamples') + def handler(stopped, **kwargs): + while not stopped: + ... + stopped.wait(60) + """ + + def __init__(self, stopper: DaemonStopper) -> None: + super().__init__() + self._stopper = stopper + + def __repr__(self) -> str: + return repr(self._stopper) + + def __bool__(self) -> bool: + return self._stopper.is_set() + + def is_set(self) -> bool: + return self._stopper.is_set() + + +class SyncDaemonStopperChecker(DaemonStopperChecker): + def wait(self, timeout: Optional[float] = None) -> bool: + self._stopper.sync_event.wait(timeout=timeout) + return bool(self) + + +class AsyncDaemonStopperChecker(DaemonStopperChecker): + async def wait(self, timeout: Optional[float] = None) -> bool: + try: + await asyncio.wait_for(self._stopper.async_event.wait(), timeout=timeout) + except asyncio.TimeoutError: + pass + return bool(self) From 26eef182163f87296f70c1369eb39f7b5e249152 Mon Sep 17 00:00:00 2001 From: Sergey Vasilyev Date: Mon, 30 Mar 2020 15:36:05 +0200 Subject: [PATCH 10/20] Explain different implementations of daemons termination methods --- kopf/reactor/daemons.py | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/kopf/reactor/daemons.py b/kopf/reactor/daemons.py index bf787e9b..81a0fd09 100644 --- a/kopf/reactor/daemons.py +++ b/kopf/reactor/daemons.py @@ -100,6 +100,35 @@ async def stop_resource_daemons( for deletion. It can take some time until the deletion handlers also finish their work. The object is not physically deleted until all the daemons are terminated (by putting a finalizer on it). + + **Notes on this non-trivial implementation:** + + There is a same-purpose function `stop_daemon`, which works fully in-memory. + That method is used when killing the daemons on operator exit. + This method is used when the resource is deleted. + + The difference is that in this method (termination with delays and patches), + other on-deletion handlers can be happening at the same time as the daemons + are being terminated (it can take time due to backoffs and timeouts). 
+ In the end, the finalizer should be removed only once all deletion handlers + have succeeded and all daemons are terminated -- not earlier than that. + None of this (handlers and finalizers) is needed for the operator exiting. + + To know "when" the next check of daemons should be performed: + + * EITHER the operator should block this resource's processing and wait until + the daemons are terminated -- thus leaking daemon's abstractions and logic + and tools (e.g. aiojobs scheduler) to the upper level of processing; + + * OR the daemons termination should mimic the change-detection handlers + and simulate the delays with multiple handling cycles -- in order to + re-check the daemon's status regularly until they are done. + + Both of this approaches have the same complexity. But the latter one + keep the logic isolated into the daemons module/routines (a bit cleaner). + + Hence, these duplicating methods of termination for different cases + (as by their surrounding circumstances: deletion handlers and finalizers). """ delays: List[float] = [] now = time.monotonic() @@ -192,6 +221,8 @@ async def stop_daemon( The purpose is the same as in `stop_resource_daemons`, but this function is called on operator exiting, so there is no multi-step handling, everything happens in memory and linearly (while respecting the timing). + + For explanation on different implementations, see `stop_resource_daemons`. """ if isinstance(daemon.handler, handlers_.ResourceDaemonHandler): backoff = daemon.handler.cancellation_backoff From 41accd8d7286c0b9cb0a21f2203b8036a1c85022 Mon Sep 17 00:00:00 2001 From: Sergey Vasilyev Date: Sun, 8 Mar 2020 22:05:42 +0100 Subject: [PATCH 11/20] Document daemons & timers --- docs/daemons.rst | 374 +++++++++++++++++++++++++++++++++++++++++++++++ docs/index.rst | 2 + docs/kwargs.rst | 23 ++- docs/timers.rst | 228 +++++++++++++++++++++++++++++ 4 files changed, 626 insertions(+), 1 deletion(-) create mode 100644 docs/daemons.rst create mode 100644 docs/timers.rst diff --git a/docs/daemons.rst b/docs/daemons.rst new file mode 100644 index 00000000..6b16b7ef --- /dev/null +++ b/docs/daemons.rst @@ -0,0 +1,374 @@ +======= +Daemons +======= + +Daemons are a special type of handlers for background logic that accompanies +the Kubernetes resources during their life cycle. + +Unlike event-driven short-running handlers declared with ``@kopf.on``, +daemons are started when an object is created or an operator is started, +and are capable of running indefinitely (or infinitely) long. + +The daemons are stopped when the object is deleted +or the whole operator is exiting/restarting. + + +Spawning +======== + +To have a daemon accompanying a resource of some kind, decorate a function +with ``@kopf.daemon`` and make it run for long time or forever: + +.. code-block:: python + + import asyncio + import time + import kopf + + @kopf.daemon('zalando.org', 'v1', 'kopfexamples') + async def monitor_kex_async(**kwargs): + while True: + ... # check something + await asyncio.sleep(10) + + @kopf.daemon('zalando.org', 'v1', 'kopfexamples') + def monitor_kex_sync(stopped, **kwargs): + while not stopped: + ... # check something + time.sleep(10) + +Synchronous functions are executed in threads, asynchronous functions are +executed directly in the asyncio event loop of the operator -- same as with +regular handlers, except that a separate thread pool is used for daemons +to prevent thread pool depletion for regular handlers. +See :doc:`async`. 
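+
+As an illustrative sketch only (reusing the same example resource as above):
+a blocking call such as ``time.sleep()`` inside an ``async`` daemon would stall
+the operator's whole event loop, while ``await asyncio.sleep()`` yields control
+back to it; blocking calls belong in synchronous daemons (i.e. in threads):
+
+.. code-block:: python
+
+    import asyncio
+    import time
+    import kopf
+
+    @kopf.daemon('zalando.org', 'v1', 'kopfexamples')
+    async def badly_blocking_daemon(**kwargs):
+        while True:
+            time.sleep(10)  # blocks the whole operator's event loop (avoid this)
+
+    @kopf.daemon('zalando.org', 'v1', 'kopfexamples')
+    async def cooperative_daemon(**kwargs):
+        while True:
+            await asyncio.sleep(10)  # yields control back to the event loop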
+
+
+Termination
+===========
+
+The daemons are terminated when either their resource is marked for deletion,
+or the operator itself is exiting.
+
+In both cases, the daemons are requested to terminate gracefully by setting
+the :kwarg:`stopped` kwarg. The synchronous daemons MUST_, and asynchronous
+daemons SHOULD_ check for the value of this flag as often as possible:
+
+.. code-block:: python
+
+    import time
+    import kopf
+
+    @kopf.daemon('zalando.org', 'v1', 'kopfexamples')
+    def monitor_kex(stopped, **kwargs):
+        while not stopped:
+            time.sleep(1.0)
+        print("We are done. Bye.")
+
+The asynchronous daemons can skip these checks if they define the cancellation
+timeout (see below). In that case, they can expect an `asyncio.CancelledError`
+to be raised at any point of their code (specifically, at any ``await`` clause):
+
+.. code-block:: python
+
+    import asyncio
+    import kopf
+
+    @kopf.daemon('zalando.org', 'v1', 'kopfexamples', cancellation_timeout=1.0)
+    async def monitor_kex(**kwargs):
+        try:
+            while True:
+                await asyncio.sleep(10)
+        except asyncio.CancelledError:
+            print("We are done. Bye.")
+
+With no cancellation timeout set, cancellation is not performed at all,
+as it is unclear how long the coroutine should be awaited. However,
+it is cancelled when the operator exits and stops all "hung" left-over tasks
+(not specifically daemons).
+
+.. note::
+
+    The MUST_ / SHOULD_ distinction exists because Python has no way of terminating
+    a thread unless the thread exits on its own. The :kwarg:`stopped` flag
+    is a way to signal the thread that it should exit. If :kwarg:`stopped` is not
+    checked, the synchronous daemons will run forever or until an error happens.
+
+.. _MUST: https://tools.ietf.org/rfc/rfc2119.txt
+.. _SHOULD: https://tools.ietf.org/rfc/rfc2119.txt
+
+
+Timeouts
+========
+
+The termination sequence parameters can be controlled when declaring a daemon:
+
+.. code-block:: python
+
+    import asyncio
+    import kopf
+
+    @kopf.daemon('zalando.org', 'v1', 'kopfexamples',
+                 cancellation_backoff=1.0, cancellation_timeout=3.0)
+    async def monitor_kex(stopped, **kwargs):
+        while not stopped:
+            await asyncio.sleep(1)
+
+The daemon is terminated in three stages:
+
+* 1. Graceful termination:
+    * ``stopped`` is set immediately (unconditionally).
+    * ``cancellation_backoff`` is awaited (if set).
+* 2. Forced termination -- only if ``cancellation_timeout`` is set:
+    * `asyncio.CancelledError` is raised (for async daemons only).
+    * ``cancellation_timeout`` is awaited (if set).
+* 3a. Giving up and abandoning -- only if ``cancellation_timeout`` is set:
+    * A `ResourceWarning` is issued for potential OS resource leaks.
+    * The finalizer is removed, and the object is released for potential deletion.
+* 3b. Forever polling -- only if ``cancellation_timeout`` is not set:
+    * The daemon continues to be awaited forever, logging from time to time.
+    * The finalizer is not removed, and the object remains blocked from deletion.
+
+The ``cancellation_timeout`` is measured from the point when the daemon
+is cancelled (i.e. when the forced termination begins, once the cancellation
+backoff is over), not from when the termination sequence itself begins.
+The total termination time is ``cancellation_backoff + cancellation_timeout``.
+
+.. warning::
+
+    When the operator is exiting, it has its own timeout of 5 seconds
+    for all "hung" tasks. This includes the daemons after they are requested
+    to exit gracefully and all timeouts are reached.
+ + If the daemon termination takes longer than this for any reason, + the daemon will be cancelled (by the operator, not by the daemon guard) + regardless of the graceful timeout of the daemon. If this does not help, + the operator will be waiting for all hung tasks until SIGKILL'ed. + +.. warning:: + + If the operator is running in a cluster, there can be timeouts set for a pod + (``terminationGracePeriodSeconds``, the default is 30 seconds). + + If the daemon termination is longer than this timeout, the daemons will not + be finished in full at the operator exit, as the pod will be SIGKILL'ed. + +Kopf itself does not set any implicit timeouts for the daemons. +Either design the daemons to exit as fast as possible, or configure +``terminationGracePeriodSeconds`` and cancellation timeouts accordingly. + + +Safe sleep +========== + +For synchronous daemons, it is recommended to use ``stopped.wait()`` +instead of ``time.sleep()``: the wait will end when either the time is reached +(as with the sleep), or immediately when the stopped flag is set: + +.. code-block:: python + + import kopf + + @kopf.daemon('zalando.org', 'v1', 'kopfexamples') + def monitor_kex(stopped, **kwargs): + while not stopped: + stopped.wait(10) + +For asynchronous handlers, regular ``asyncio.sleep()`` should be sufficient, +as it is cancellable via `asyncio.CancelledError`. If cancellation is neither +configured nor desired, ``stopped.wait()`` can be used too (with ``await``): + +.. code-block:: python + + import kopf + + @kopf.daemon('zalando.org', 'v1', 'kopfexamples') + async def monitor_kex(stopped, **kwargs): + while not stopped: + await stopped.wait(10) + +This way, the daemon will exit as soon as possible when the :kwarg:`stopped`` +is set, not when the next sleep is over. Therefore, the sleeps can be of any +duration while the daemon remains terminable (leads to no OS resource leakage). + +.. note:: + + Synchronous and asynchronous daemons get different types of stop-checker: + with synchronous and asynchronous interfaces respectively. + Therefore, they should be used accordingly: without or with ``await``. + + +Postponing +========== + +Normally, daemons are spawned immediately once resource becomes visible +to the operator: i.e. on resource creation or operator startup. + +It is possible to postpone the daemon spawning: + +.. code-block:: python + + import asyncio + import kopf + + @kopf.daemon('zalando.org', 'v1', 'kopfexamples', initial_backoff=30) + async def monitor_kex(stopped, **kwargs): + while True: + await asyncio.sleep(1.0) + + +The daemon will not be started during the specified time after it should +be started. For example, this can be used to give some time for regular +event-driven handlers to finish without producing too much activity. + + +Restarting +========== + +It is generally expected that daemons are designed to run forever. +However, it is possible for a daemon to exit prematurely, i.e. before +the resource is deleted or the operator is exiting. + +In that case, the daemon will not be restarted again during the lifecycle +of this resource in this operator process (however, it will be spawned again +if the operator restarts). This way, it becomes a long-running equivalent +of on-creation/on-resuming handlers. + +To simulate restarting, raise `kopf.TemporaryError` with a delay set. + +.. 
code-block:: python + + import asyncio + import kopf + + @kopf.daemon('zalando.org', 'v1', 'kopfexamples') + async def monitor_kex(stopped, **kwargs): + await asyncio.sleep(10.0) + raise kopf.TemporaryError("Need to restart.", delay=10) + +Same as with regular error handling, a delay of ``None`` means instant restart. + + +Deletion prevention +=================== + +Normally, a finalizer is put on the resource if there are daemons running +for it -- to prevent its actual deletion until all the daemons are terminated. + +Only after the daemons are terminated, the finalizer is removed to release +the object for actual deletion. + +However, it is possible to have daemons that disobey the exiting signals +and continue running after the timeouts. In that case, the finalizer is +anyway removed, and the orphaned daemons are left to themselves. + + +Resource fields access +====================== + +The resource's current state is accessible at any time through regular kwargs +(see :doc:`kwargs`): :kwarg:`body`, :kwarg:`spec`, :kwarg:`meta`, +:kwarg:`status`, :kwarg:`uid`, :kwarg:`name`, :kwarg:`namespace`, etc. + +The values are "live views" of the current state of the object as it is being +modified during its lifecycle (not frozen as in the event-driven handlers): + +.. code-block:: python + + import random + import time + import kopf + + @kopf.daemon('zalando.org', 'v1', 'kopfexamples') + def monitor_kex(stopped, logger, body, spec, **kwargs): + while not stopped: + logger.info(f"FIELD={spec['field']}") + time.sleep(1) + + @kopf.timer('zalando.org', 'v1', 'kopfexamples', interval=2.5) + def modify_kex_sometimes(patch, **kwargs): + patch.spec['field'] = random.randint(0, 100) + +Always access the fields through the provided kwargs, and do not store +them in local variables. Internally, Kopf substitutes the whole object's +body on every external change. Storing the field values to the variables +will remember their value as it was at that moment in time, +and will not be updated as the object changes. + + +Results delivery +================ + +As with any other handlers, it is possible for the daemons to return +arbitrary JSON-serializable values to be put on the resource's status: + +.. code-block:: python + + import asyncio + import kopf + + @kopf.daemon('zalando.org', 'v1', 'kopfexamples') + async def monitor_kex(stopped, **kwargs): + await asyncio.sleep(10.0) + return {'finished': True} + + +Error handling +============== + +The error handling is the same as for all other handlers: see :doc:`errors`: + +.. code-block:: python + + @kopf.daemon('zalando.org', 'v1', 'kopfexamples', + errors=kopf.ErrorsMode.TEMPORARY, backoff=1, retries=10) + def monitor_kex(retry, **_): + if retry < 3: + raise kopf.TemporaryError("I'll be back!", delay=1) + elif retry < 5: + raise EnvironmentError("Something happened!") + else: + raise kopf.PermanentError("Bye-bye!") + +If a permanent error is raised, the daemon will never be restarted again. +Same as when the daemon exits on its own (but this could be reconsidered +in the future). + + +Filtering +========= + +It is also possible to use the existing :doc:`filters` +to only spawn daemons for specific resources: + +.. 
code-block:: python + + import time + import kopf + + @kopf.daemon('zalando.org', 'v1', 'kopfexamples', + annotations={'some-annotation': 'some-value'}, + labels={'some-label': 'some-value'}, + when=lambda name, **_: 'some' in name) + def monitor_selected_kexes(stopped, **kwargs): + while not stopped: + time.sleep(1) + +Other (non-matching) resources of that kind will be ignored. + + +System resources +================ + +.. warning:: + + A separate OS thread or asyncio task is started + for each individual resource and each individual handler. + + Having hundreds or thousands of OS threads or asyncio tasks can consume + system resources significantly. Make sure you only have daemons and timers + with appropriate filters (e.g., by labels, annotations, or so). + + For the same reason, prefer to use async handlers (with properly designed + async/await code), since asyncio tasks are a somewhat cheaper than threads. + See :doc:`async` for details. diff --git a/docs/index.rst b/docs/index.rst index 4970434b..65b66a7f 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -27,6 +27,8 @@ Kopf: Kubernetes Operators Framework :caption: Resource handling: handlers + daemons + timers kwargs async loading diff --git a/docs/kwargs.rst b/docs/kwargs.rst index dd02d3f5..5a46b356 100644 --- a/docs/kwargs.rst +++ b/docs/kwargs.rst @@ -23,7 +23,7 @@ Retrying and timing =================== Most (but not all) of the handlers -- such as resource change detection, -and activity handlers -- are capable of retrying +resource daemons and timers, and activity handlers -- are capable of retrying their execution in case of errors (see also: :doc:`errors`). They provide kwargs regarding the retrying process: @@ -173,3 +173,24 @@ Diffing the detected changes. The new state usually corresponds to :kwarg:`body`. ``diff`` is a list of changes of the object between old & new states. + + +Resource daemon kwargs +====================== + + +.. kwarg:: stopped + +Stop-flag +--------- + +The daemons also have ``stopped``. It is a flag object for sync daemons +to check if they should stop. See also: `DaemonStopperChecker`. + +To check, ``.is_set()`` method can be called, or the object itself can be used +as a boolean expression: e.g. ``while not stopped: ...``. + +Its ``.wait()`` method can be used to replace ``time.sleep()`` +or ``asyncio.sleep()`` for faster (instant) termination on resource deletion. + +See more: :doc:`daemons`. diff --git a/docs/timers.rst b/docs/timers.rst new file mode 100644 index 00000000..2b2d27a9 --- /dev/null +++ b/docs/timers.rst @@ -0,0 +1,228 @@ +====== +Timers +====== + +Timers are schedules of regular handler execution as long as the object exists, +no matter if there were any changes or not -- unlike the regular handlers, +which are event-driven and are triggered only when something changes. + + +Intervals +========= + +The interval defines how often to trigger the handler (in seconds): + +.. code-block:: python + + import asyncio + import time + import kopf + + @kopf.timer('zalando.org', 'v1', 'kopfexamples', interval=1.0) + def ping_kex(spec, **kwargs): + pass + + +Sharpness +========= + +Usually (by default), the timers are invoked with the specified interval +between each call. The time taken by the handler itself is not taken into +account. It is possible to define timers with sharp schedule: i.e. invoked +every number of seconds sharp, no matter how long it takes to execute it: + +.. 
code-block:: python + + import asyncio + import time + import kopf + + @kopf.timer('zalando.org', 'v1', 'kopfexamples', interval=1.0, sharp=True) + def ping_kex(spec, **kwargs): + time.sleep(0.3) + +In this example, the timer takes 0.3 seconds to execute. The actual interval +between the timers will be 0.7 seconds in the sharp mode: whatever is left +of the declared interval of 1.0 seconds minus the execution time. + + +Idling +====== + +Timers can be defined to idle if the resource changes too often, and only +be invoked when it is stable for some time: + +.. code-block:: python + + import asyncio + import kopf + + @kopf.timer('zalando.org', 'v1', 'kopfexamples', idle=10) + def ping_kex(spec, **kwargs): + print(f"FIELD={spec['field']}") + +Creation of a resource is considered as a change, so idling also shifts +the very first invocation by that time. + +The default is to have no idle time, just the intervals. + +It is possible to have a timer with both idling and interval. In that case, +the timer will be invoked only if there were no changes for specified duration +(idle), and every N seconds after that (interval), as long as the object does +not change. Once changed, the timer will stop and wait for the new idling time: + +.. code-block:: python + + import asyncio + import kopf + + @kopf.timer('zalando.org', 'v1', 'kopfexamples', idle=10, interval=1) + def ping_kex(spec, **kwargs): + print(f"FIELD={spec['field']}") + + +Postponing +========== + +Normally, timers are invoked immediately once resource becomes visible +to the operator (unless idling is declared). + +It is possible to postpone the invocations: + +.. code-block:: python + + import asyncio + import time + import kopf + + @kopf.timer('zalando.org', 'v1', 'kopfexamples', interval=1, initial_backoff=5) + def ping_kex(spec, **kwargs): + print(f"FIELD={spec['field']}") + +This is similar to idling, except that it is applied only once per +resource/operator lifecycle in the very beginning. + + +Combined timing +=============== + +It is possible to combine all schedule intervals to achieve the desired effect. +For example, to give an operator 1 minute for warming up, and then pinging +the resources every 10 seconds if they are unmodified for 10 minutes: + +.. code-block:: python + + import kopf + + @kopf.timer('zalando.org', 'v1', 'kopfexamples', + initial_backoff=60, interval=10, idle=600) + def ping_kex(spec, **kwargs): + pass + + +Errors in timers +================ + +The timers follow the standard :doc:`error handling ` protocol: +:class:`TemporaryError` and arbitrary exceptions are treated according to +the ``errors``, ``timeout``, ``retries``, ``backoff`` options of the handler. +The kwargs :kwarg:`retry`, :kwarg:`started`, :kwarg:`runtime` are provided too. + +The default behaviour is to retry arbitrary error +(similar to the regular resource handlers). + +When an error happens, its delay overrides the timer's schedule or life cycle: + +* For arbitrary exceptions, the timer's ``backoff=...`` option is used. +* For `kopf.TemporaryError`, the error's ``delay=...`` option is used. +* For `kopf.PermanentError`, the timer stops forever and is never retried. + +The timer's own interval is only used of the function exits successfully. + +For example, if the handler fails 3 times with a back-off time set to 5 seconds +and the interval set to 10 seconds, it will take 25 seconds (``3*5+10``) +from the first execution to the end of the retrying cycle: + +.. 
code-block:: python + + import kopf + + @kopf.timer('zalando.org', 'v1', 'kopfexamples', + errors=kopf.ErrorsMode.TEMPORARY, interval=10, backoff=5) + def monitor_kex_by_time(name, retry, **kwargs): + if retry < 3: + raise Exception() + +It will be executed in that order: + +* A new cycle begins: + * 1st execution attempt fails (``retry == 0``). + * Waits for 5 seconds (``backoff``). + * 2nd execution attempt fails (``retry == 1``). + * Waits for 5 seconds (``backoff``). + * 3rd execution attempt fails (``retry == 2``). + * Waits for 5 seconds (``backoff``). + * 4th execution attempt succeeds (``retry == 3``). + * Waits for 10 seconds (``interval``). +* A new cycle begins: + * 5th execution attempt fails (``retry == 0``). + +The timer never overlaps with itself. Though, multiple timers with +different interval settings and execution schedules can eventually overlap +with each other and with event-driven handlers. + + +Results delivery +================ + +The timers follow the standard :doc:`results delivery ` protocol: +the returned values are put on the object's status under the handler's id +as a key. + +.. code-block:: python + + import random + import kopf + + @kopf.timer('zalando.org', 'v1', 'kopfexamples', interval=10) + def ping_kex(spec, **kwargs): + return random.randint(0, 100) + +.. note:: + + Whenever a resulting value is serialised and put on the resource's status, + it modifies the resource, which, in turn, resets the idle timer. + Use carefully with both idling & returned results. + + +Filtering +========= + +It is also possible to use the existing :doc:`filters`: + +.. code-block:: python + + import kopf + + @kopf.timer('zalando.org', 'v1', 'kopfexamples', interval=10, + annotations={'some-annotation': 'some-value'}, + labels={'some-label': 'some-value'}, + when=lambda name, **_: 'some' in name) + def ping_kex(spec, **kwargs): + pass + + +System resources +================ + +.. warning:: + + Timers are implemented the same way as asynchronous daemons + (see :doc:`daemons`) — via asyncio tasks for every resource & handler. + + Despite OS threads are not involved until the synchronous functions + are invoked (through the asyncio executors), this can lead to significant + OS resource usage on large clusters with thousands of resources. + + Make sure you only have daemons and timers with appropriate filters + (e.g., by labels, annotations, or so). From d4ac59846cfe29a88d7f655decb84be1367ce694 Mon Sep 17 00:00:00 2001 From: Sergey Vasilyev Date: Mon, 9 Mar 2020 01:46:03 +0100 Subject: [PATCH 12/20] Document reconciliation recipes (overview with links) --- docs/index.rst | 1 + docs/reconciliation.rst | 86 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 87 insertions(+) create mode 100644 docs/reconciliation.rst diff --git a/docs/index.rst b/docs/index.rst index 65b66a7f..215133f6 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -64,6 +64,7 @@ Kopf: Kubernetes Operators Framework deployment continuity idempotence + reconciliation troubleshooting .. toctree:: diff --git a/docs/reconciliation.rst b/docs/reconciliation.rst new file mode 100644 index 00000000..491a904d --- /dev/null +++ b/docs/reconciliation.rst @@ -0,0 +1,86 @@ +============== +Reconciliation +============== + +Reconciliation is, in plain words, bringing the *actual state* of a system +to a *desired state* as expressed by the Kubernetes resources. +For example, starting as many pods, as it is declared in a deployment, +especially when this declaration changes due to resource updates. 
+ +Kopf is not an operator, it is a framework to make operators. +Therefore, it knows nothing about the *desired state* or *actual state* +(or any *state* at all). + +Kopf-based operators must implement the checks and reactions to the changes, +so that both states are synchronised according to the operator's concepts. + +Kopf only provides few ways and tools of achieving this easily. + + +Event-driven reactions +====================== + +Normally, Kopf triggers the on-creation/on-update/on-deletion handlers +every time anything changes on the object, as reported by Kubernetes API. +It provides both the current state of the object, and a diff with the last +handled state. + +The event-driven approach is the best, as it saves system resources (RAM & CPU), +and does not trigger any activity when it is not needed, and does not consume +memory for keeping the object's last known state permanently in memory. + +But it is more difficult to develop, and is not suitable for some cases: +e.g., when an external non-Kubernetes system is monitored via its own API. + +.. seealso:: + :doc:`handlers` + + +Regularly scheduled timers +========================== + +Timers are triggered on regular schedule, regardless of whether anything +changes or does not change in the resource itself. This can be used to +verify both the resource's body, and the state of other related resources +though API calls, and update the original resource's status/content. + +.. seealso:: + :doc:`timers` + + +Permanently running daemons +=========================== + +As a last resort, a developer can implement their own background task, +which checks the status of the system and reacts when the "actual" state +diverts from the "desired" state. + +.. seealso:: + :doc:`daemons` + + +What to use when? +================= + +As a rule of thumb _(recommended, but not insisted)_, the following guidelines +can be used to decide which way of reconciliation to use in which cases: + +* In the first place, try the event-driven approach by watching + for the children resources (those belonging to the "actual" state). + + If there are many children resources for one parent resource, + store their brief statuses on the parent's ``status.children.{id}`` + from every individual child, and react to the changes of ``status.children`` + in the parent resource. + +* If the "desired" state can be queried with blocking waits + (e.g. by running a ``GET`` query on a remote job/task/activity via an API, + which blocks until the requested condition is reached), + then use daemons to poll for the status, and process it as soon as it changes. + +* If the "desired" state is not Kubernetes-related, maybe it is an external + system accessed by an API, or if delays in reconciliation are acceptable, + then use the timers. + +* Only as the last resort, use the daemons with a ``while True`` cycle + and explicit sleep. 
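
To illustrate the last bullet above: such a last-resort polling daemon could be
sketched as follows, using a ``while not stopped`` cycle with an interruptible
sleep as recommended in :doc:`daemons` (the state check itself is left as a
placeholder, since it is operator-specific):

.. code-block:: python

    import kopf

    @kopf.daemon('zalando.org', 'v1', 'kopfexamples')
    async def reconcile_kex(stopped, spec, logger, **_):
        while not stopped:
            # Compare the actual state with the desired one and fix the divergence here;
            # the checking/fixing logic is operator-specific and is omitted in this sketch.
            logger.info(f"Verifying the actual state against field={spec.get('field')!r}")
            await stopped.wait(10)  # an interruptible sleep between the checks
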
From 86e173443648cef501bb0ac58b75c6e974c7770e Mon Sep 17 00:00:00 2001 From: Sergey Vasilyev Date: Mon, 9 Mar 2020 01:54:02 +0100 Subject: [PATCH 13/20] Add examples of daemons & timers --- examples/14-daemons/example.py | 37 ++++++++++++++++++++++++++++++++++ examples/15-timers/example.py | 11 ++++++++++ 2 files changed, 48 insertions(+) create mode 100644 examples/14-daemons/example.py create mode 100644 examples/15-timers/example.py diff --git a/examples/14-daemons/example.py b/examples/14-daemons/example.py new file mode 100644 index 00000000..f0a0116d --- /dev/null +++ b/examples/14-daemons/example.py @@ -0,0 +1,37 @@ +import asyncio +import time + +import kopf + + +# Sync daemons in threads are non-interruptable, they must check for the `stopped` flag. +# This daemon exits after 3 attempts and then 30 seconds of running (unless stopped). +@kopf.daemon('zalando.org', 'v1', 'kopfexamples', backoff=3) +def background_sync(spec, stopped, logger, retry, patch, **_): + if retry < 3: + patch.status['message'] = f"Failed {retry+1} times." + raise kopf.TemporaryError("Simulated failure.", delay=1) + + started = time.time() + while not stopped and time.time() - started <= 30: + logger.info(f"=> Ping from a sync daemon: field={spec['field']!r}, retry={retry}") + stopped.wait(5.0) + + patch.status['message'] = "Accompanying is finished." + + +# Async daemons do not need the `stopped` signal, they can rely on `asyncio.CancelledError` raised. +# This daemon runs forever (until stopped, i.e. cancelled). Yet fails to start for 3 first times. +@kopf.daemon('zalando.org', 'v1', 'kopfexamples', backoff=3, + cancellation_backoff=1.0, cancellation_timeout=0.5) +async def background_async(spec, logger, retry, **_): + if retry < 3: + raise kopf.TemporaryError("Simulated failure.", delay=1) + + while True: + logger.info(f"=> Ping from an async daemon: field={spec['field']!r}") + await asyncio.sleep(5.0) + + +E2E_CREATION_STOP_WORDS = ["=> Ping from"] +E2E_DELETION_STOP_WORDS = ["'background_async' is cancelled", "'background_sync' is cancelled", "'background_async' has exited"] diff --git a/examples/15-timers/example.py b/examples/15-timers/example.py new file mode 100644 index 00000000..72753b53 --- /dev/null +++ b/examples/15-timers/example.py @@ -0,0 +1,11 @@ +import kopf + + +@kopf.timer('zalando.org', 'v1', 'kopfexamples', idle=5, interval=2) +def every_few_seconds_sync(spec, logger, **_): + logger.info(f"Ping from a sync timer: field={spec['field']!r}") + + +@kopf.timer('zalando.org', 'v1', 'kopfexamples', idle=10, interval=4) +async def every_few_seconds_async(spec, logger, **_): + logger.info(f"Ping from an async timer: field={spec['field']!r}") From c44d1e6396ef0efe103ac025c031fe122ec77fdb Mon Sep 17 00:00:00 2001 From: Sergey Vasilyev Date: Wed, 18 Mar 2020 23:51:27 +0100 Subject: [PATCH 14/20] Stop mocking the real sleep when it is switched to the sleeping engine The real sleep is needed in some cases to give control to asyncio's event loop (`sleep(0)`), or to debug the tests. When it is mocked, it is difficult to do anything. All the code uses the sleeping engine now, so there is no need to check for the actual `asyncio.sleep()` calls. 
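
For reference, the "give control to the event loop" behaviour of a zero-second
sleep can be seen in a tiny standalone script (illustrative only):

    import asyncio

    async def background():
        print("the background task got its turn")

    async def main():
        asyncio.create_task(background())
        await asyncio.sleep(0)   # suspends main(), letting the ready task above run
        print("main() continues")

    asyncio.run(main())
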
--- tests/handling/conftest.py | 2 -- tests/handling/test_cause_handling.py | 7 ------- tests/handling/test_no_handlers.py | 1 - tests/handling/test_timing_consistency.py | 1 - 4 files changed, 11 deletions(-) diff --git a/tests/handling/conftest.py b/tests/handling/conftest.py index a94b5bd0..c31861ca 100644 --- a/tests/handling/conftest.py +++ b/tests/handling/conftest.py @@ -47,7 +47,6 @@ class K8sMocks: patch_obj: Mock post_event: Mock - asyncio_sleep: Mock sleep_or_wait: Mock @@ -57,7 +56,6 @@ def k8s_mocked(mocker, resp_mocker): return K8sMocks( patch_obj=mocker.patch('kopf.clients.patching.patch_obj'), post_event=mocker.patch('kopf.clients.events.post_event'), - asyncio_sleep=mocker.patch('asyncio.sleep'), sleep_or_wait=mocker.patch('kopf.engines.sleeping.sleep_or_wait', return_value=None), ) diff --git a/tests/handling/test_cause_handling.py b/tests/handling/test_cause_handling.py index b759d918..69f44055 100644 --- a/tests/handling/test_cause_handling.py +++ b/tests/handling/test_cause_handling.py @@ -36,7 +36,6 @@ async def test_create(registry, settings, handlers, resource, cause_mock, event_ assert not handlers.update_mock.called assert not handlers.delete_mock.called - assert k8s_mocked.asyncio_sleep.call_count == 0 assert k8s_mocked.sleep_or_wait.call_count == 0 assert k8s_mocked.patch_obj.call_count == 1 assert not event_queue.empty() @@ -78,7 +77,6 @@ async def test_update(registry, settings, handlers, resource, cause_mock, event_ assert handlers.update_mock.call_count == 1 assert not handlers.delete_mock.called - assert k8s_mocked.asyncio_sleep.call_count == 0 assert k8s_mocked.sleep_or_wait.call_count == 0 assert k8s_mocked.patch_obj.call_count == 1 assert not event_queue.empty() @@ -121,7 +119,6 @@ async def test_delete(registry, settings, handlers, resource, cause_mock, event_ assert not handlers.update_mock.called assert handlers.delete_mock.call_count == 1 - assert k8s_mocked.asyncio_sleep.call_count == 0 assert k8s_mocked.sleep_or_wait.call_count == 0 assert k8s_mocked.patch_obj.call_count == 1 assert not event_queue.empty() @@ -165,8 +162,6 @@ async def test_gone(registry, settings, handlers, resource, cause_mock, event_ty assert not handlers.update_mock.called assert not handlers.delete_mock.called - assert not k8s_mocked.asyncio_sleep.called - assert not k8s_mocked.sleep_or_wait.called assert not k8s_mocked.patch_obj.called assert event_queue.empty() @@ -197,7 +192,6 @@ async def test_free(registry, settings, handlers, resource, cause_mock, event_ty assert not handlers.update_mock.called assert not handlers.delete_mock.called - assert not k8s_mocked.asyncio_sleep.called assert not k8s_mocked.sleep_or_wait.called assert not k8s_mocked.patch_obj.called assert event_queue.empty() @@ -229,7 +223,6 @@ async def test_noop(registry, settings, handlers, resource, cause_mock, event_ty assert not handlers.update_mock.called assert not handlers.delete_mock.called - assert not k8s_mocked.asyncio_sleep.called assert not k8s_mocked.sleep_or_wait.called assert not k8s_mocked.patch_obj.called assert event_queue.empty() diff --git a/tests/handling/test_no_handlers.py b/tests/handling/test_no_handlers.py index 53b6baa4..415c5858 100644 --- a/tests/handling/test_no_handlers.py +++ b/tests/handling/test_no_handlers.py @@ -40,7 +40,6 @@ async def test_skipped_with_no_handlers( event_queue=asyncio.Queue(), ) - assert not k8s_mocked.asyncio_sleep.called assert not k8s_mocked.sleep_or_wait.called assert k8s_mocked.patch_obj.called diff --git a/tests/handling/test_timing_consistency.py 
b/tests/handling/test_timing_consistency.py index 2cc1f795..f7d17e06 100644 --- a/tests/handling/test_timing_consistency.py +++ b/tests/handling/test_timing_consistency.py @@ -71,6 +71,5 @@ def move_to_tsB(*_, **__): # Without "now"-time consistency, neither sleep() would be called, nor a patch applied. # Verify that the patch was actually applied, so that the reaction cycle continues. - assert not k8s_mocked.asyncio_sleep.called assert k8s_mocked.patch_obj.called assert 'dummy' in k8s_mocked.patch_obj.call_args_list[-1][1]['patch']['status']['kopf'] From 2eb91dcadeb52dc766ef6d961546cfecfe889b18 Mon Sep 17 00:00:00 2001 From: Sergey Vasilyev Date: Mon, 30 Mar 2020 23:23:23 +0200 Subject: [PATCH 15/20] Tests for daemons & timers (only smoke-tests, not much detailed) --- tests/handling/daemons/conftest.py | 105 ++++++++++ tests/handling/daemons/test_daemon_errors.py | 181 +++++++++++++++++ .../daemons/test_daemon_filtration.py | 62 ++++++ .../handling/daemons/test_daemon_spawning.py | 44 +++++ .../daemons/test_daemon_termination.py | 136 +++++++++++++ tests/handling/daemons/test_timer_errors.py | 186 ++++++++++++++++++ .../handling/daemons/test_timer_filtration.py | 62 ++++++ .../handling/daemons/test_timer_intervals.py | 58 ++++++ .../handling/daemons/test_timer_triggering.py | 52 +++++ 9 files changed, 886 insertions(+) create mode 100644 tests/handling/daemons/conftest.py create mode 100644 tests/handling/daemons/test_daemon_errors.py create mode 100644 tests/handling/daemons/test_daemon_filtration.py create mode 100644 tests/handling/daemons/test_daemon_spawning.py create mode 100644 tests/handling/daemons/test_daemon_termination.py create mode 100644 tests/handling/daemons/test_timer_errors.py create mode 100644 tests/handling/daemons/test_timer_filtration.py create mode 100644 tests/handling/daemons/test_timer_intervals.py create mode 100644 tests/handling/daemons/test_timer_triggering.py diff --git a/tests/handling/daemons/conftest.py b/tests/handling/daemons/conftest.py new file mode 100644 index 00000000..f21eae60 --- /dev/null +++ b/tests/handling/daemons/conftest.py @@ -0,0 +1,105 @@ +import asyncio +import time +import unittest.mock + +import freezegun +import pytest + +import kopf +from kopf.reactor.processing import process_resource_event +from kopf.structs.bodies import RawBody +from kopf.structs.containers import ResourceMemories + + +class DaemonDummy: + + def __init__(self): + super().__init__() + self.mock = unittest.mock.MagicMock() + self.kwargs = {} + self.steps = { + 'called': asyncio.Event(), + 'finish': asyncio.Event(), + 'error': asyncio.Event(), + } + + async def wait_for_daemon_done(self): + stopped = self.kwargs['stopped'] + await stopped.wait() + while not stopped._stopper.reason & stopped._stopper.reason.DONE: + await asyncio.sleep(0) # give control back to asyncio event loop + + +@pytest.fixture() +def dummy(): + return DaemonDummy() + + +@pytest.fixture() +def memories(): + return ResourceMemories() + + +@pytest.fixture() +def simulate_cycle(k8s_mocked, registry, settings, resource, memories, mocker): + """ + Simulate K8s behaviour locally in memory (some meaningful approximation). 
+ """ + + def _merge_dicts(src, dst): + for key, val in src.items(): + if isinstance(val, dict) and key in dst: + _merge_dicts(src[key], dst[key]) + else: + dst[key] = val + + async def _simulate_cycle(event_object: RawBody): + mocker.resetall() + + await process_resource_event( + lifecycle=kopf.lifecycles.all_at_once, + registry=registry, + settings=settings, + resource=resource, + memories=memories, + raw_event={'type': 'irrelevant', 'object': event_object}, + replenished=asyncio.Event(), + event_queue=asyncio.Queue(), + ) + + # Do the same as k8s does: merge the patches into the object. + for call in k8s_mocked.patch_obj.call_args_list: + _merge_dicts(call[1]['patch'], event_object) + + return _simulate_cycle + + +@pytest.fixture() +def frozen_time(): + """ + A helper to simulate time movements to step over long sleeps/timeouts. + """ + # TODO LATER: Either freezegun should support the system clock, or find something else. + with freezegun.freeze_time("2020-01-01 00:00:00") as frozen: + # Use freezegun-supported time instead of system clocks -- for testing purposes only. + # NB: Patch strictly after the time is frozen -- to use fake_time(), not real time(). + with unittest.mock.patch('time.monotonic', time.time), \ + unittest.mock.patch('time.perf_counter', time.time): + yield frozen + + +# The time-driven tests mock the sleeps, and shift the time as much as it was requested to sleep. +# This makes the sleep realistic for the app code, though executed instantly for the tests. +@pytest.fixture() +def manual_time(k8s_mocked, frozen_time): + async def sleep_or_wait_substitute(delay, *_, **__): + if delay is None: + pass + elif isinstance(delay, float): + frozen_time.tick(delay) + else: + frozen_time.tick(min(delay)) + + k8s_mocked.sleep_or_wait.side_effect = sleep_or_wait_substitute + yield frozen_time + diff --git a/tests/handling/daemons/test_daemon_errors.py b/tests/handling/daemons/test_daemon_errors.py new file mode 100644 index 00000000..f5361434 --- /dev/null +++ b/tests/handling/daemons/test_daemon_errors.py @@ -0,0 +1,181 @@ +import logging + +import kopf +from kopf.storage.finalizers import FINALIZER + + +async def test_daemon_stopped_on_permanent_error( + registry, resource, dummy, manual_time, + caplog, assert_logs, k8s_mocked, simulate_cycle): + caplog.set_level(logging.DEBUG) + + @kopf.daemon(resource.group, resource.version, resource.plural, registry=registry, id='fn', + backoff=0.01) + async def fn(**kwargs): + dummy.mock() + dummy.kwargs = kwargs + dummy.steps['called'].set() + raise kopf.PermanentError("boo!") + + event_object = {'metadata': {'finalizers': [FINALIZER]}} + await simulate_cycle(event_object) + + await dummy.steps['called'].wait() + await dummy.wait_for_daemon_done() + + assert dummy.mock.call_count == 1 + assert k8s_mocked.patch_obj.call_count == 0 + assert k8s_mocked.sleep_or_wait.call_count == 1 # one for each retry + assert k8s_mocked.sleep_or_wait.call_args_list[0][0][0] is None + + assert_logs([ + "Daemon 'fn' failed permanently: boo!", + "Daemon 'fn' has exited on its own", + ], prohibited=[ + "Daemon 'fn' succeeded.", + ]) + + +async def test_daemon_stopped_on_arbitrary_errors_with_mode_permanent( + registry, resource, dummy, manual_time, + caplog, assert_logs, k8s_mocked, simulate_cycle): + caplog.set_level(logging.DEBUG) + + @kopf.daemon(resource.group, resource.version, resource.plural, registry=registry, id='fn', + errors=kopf.ErrorsMode.PERMANENT, backoff=0.01) + async def fn(**kwargs): + dummy.mock() + dummy.kwargs = kwargs + 
dummy.steps['called'].set() + raise Exception("boo!") + + event_object = {'metadata': {'finalizers': [FINALIZER]}} + await simulate_cycle(event_object) + + await dummy.steps['called'].wait() + await dummy.wait_for_daemon_done() + + assert dummy.mock.call_count == 1 + assert k8s_mocked.sleep_or_wait.call_count == 1 # one for each retry + assert k8s_mocked.sleep_or_wait.call_args_list[0][0][0] is None + + assert_logs([ + "Daemon 'fn' failed with an exception. Will stop.", + "Daemon 'fn' has exited on its own", + ], prohibited=[ + "Daemon 'fn' succeeded.", + ]) + + +async def test_daemon_retried_on_temporary_error( + registry, settings, resource, dummy, manual_time, + caplog, assert_logs, k8s_mocked, simulate_cycle): + caplog.set_level(logging.DEBUG) + + @kopf.daemon(resource.group, resource.version, resource.plural, registry=registry, id='fn', + backoff=1.0) + async def fn(retry, **kwargs): + dummy.mock() + dummy.kwargs = kwargs + dummy.steps['called'].set() + if not retry: + raise kopf.TemporaryError("boo!", delay=1.0) + else: + dummy.steps['finish'].set() + + event_object = {'metadata': {'finalizers': [FINALIZER]}} + await simulate_cycle(event_object) + + await dummy.steps['called'].wait() + await dummy.steps['finish'].wait() + await dummy.wait_for_daemon_done() + + assert k8s_mocked.sleep_or_wait.call_count == 2 # one for each retry + assert k8s_mocked.sleep_or_wait.call_args_list[0][0][0] == 1.0 # [call#][args/kwargs][arg#] + assert k8s_mocked.sleep_or_wait.call_args_list[1][0][0] is None + + assert_logs([ + "Daemon 'fn' failed temporarily: boo!", + "Daemon 'fn' succeeded.", + "Daemon 'fn' has exited on its own", + ]) + + +async def test_daemon_retried_on_arbitrary_error_with_mode_temporary( + registry, resource, dummy, + caplog, assert_logs, k8s_mocked, simulate_cycle, manual_time): + caplog.set_level(logging.DEBUG) + + @kopf.daemon(resource.group, resource.version, resource.plural, registry=registry, id='fn', + errors=kopf.ErrorsMode.TEMPORARY, backoff=1.0) + async def fn(retry, **kwargs): + dummy.mock() + dummy.kwargs = kwargs + dummy.steps['called'].set() + if not retry: + raise Exception("boo!") + else: + dummy.steps['finish'].set() + + event_object = {'metadata': {'finalizers': [FINALIZER]}} + await simulate_cycle(event_object) + + await dummy.steps['called'].wait() + await dummy.steps['finish'].wait() + await dummy.wait_for_daemon_done() + + assert k8s_mocked.sleep_or_wait.call_count == 2 # one for each retry + assert k8s_mocked.sleep_or_wait.call_args_list[0][0][0] == 1.0 # [call#][args/kwargs][arg#] + assert k8s_mocked.sleep_or_wait.call_args_list[1][0][0] is None + + assert_logs([ + "Daemon 'fn' failed with an exception. 
Will retry.", + "Daemon 'fn' succeeded.", + "Daemon 'fn' has exited on its own", + ]) + + +async def test_daemon_retried_until_retries_limit( + registry, resource, dummy, + caplog, assert_logs, k8s_mocked, simulate_cycle, manual_time): + caplog.set_level(logging.DEBUG) + + @kopf.daemon(resource.group, resource.version, resource.plural, registry=registry, id='fn', + retries=3) + async def fn(**kwargs): + dummy.kwargs = kwargs + dummy.steps['called'].set() + raise kopf.TemporaryError("boo!", delay=1.0) + + await simulate_cycle({}) + await dummy.steps['called'].wait() + await dummy.wait_for_daemon_done() + + assert k8s_mocked.sleep_or_wait.call_count == 4 # one for each retry + assert k8s_mocked.sleep_or_wait.call_args_list[0][0][0] == 1.0 # [call#][args/kwargs][arg#] + assert k8s_mocked.sleep_or_wait.call_args_list[1][0][0] == 1.0 # [call#][args/kwargs][arg#] + assert k8s_mocked.sleep_or_wait.call_args_list[2][0][0] == 1.0 # [call#][args/kwargs][arg#] + assert k8s_mocked.sleep_or_wait.call_args_list[3][0][0] is None + + +async def test_daemon_retried_until_timeout( + registry, resource, dummy, + caplog, assert_logs, k8s_mocked, simulate_cycle, manual_time): + caplog.set_level(logging.DEBUG) + + @kopf.daemon(resource.group, resource.version, resource.plural, registry=registry, id='fn', + timeout=3.0) + async def fn(**kwargs): + dummy.kwargs = kwargs + dummy.steps['called'].set() + raise kopf.TemporaryError("boo!", delay=1.0) + + await simulate_cycle({}) + await dummy.steps['called'].wait() + await dummy.wait_for_daemon_done() + + assert k8s_mocked.sleep_or_wait.call_count == 4 # one for each retry + assert k8s_mocked.sleep_or_wait.call_args_list[0][0][0] == 1.0 # [call#][args/kwargs][arg#] + assert k8s_mocked.sleep_or_wait.call_args_list[1][0][0] == 1.0 # [call#][args/kwargs][arg#] + assert k8s_mocked.sleep_or_wait.call_args_list[2][0][0] == 1.0 # [call#][args/kwargs][arg#] + assert k8s_mocked.sleep_or_wait.call_args_list[3][0][0] is None diff --git a/tests/handling/daemons/test_daemon_filtration.py b/tests/handling/daemons/test_daemon_filtration.py new file mode 100644 index 00000000..18838395 --- /dev/null +++ b/tests/handling/daemons/test_daemon_filtration.py @@ -0,0 +1,62 @@ +import logging + +import pytest + +import kopf +from kopf.storage.finalizers import FINALIZER + + +# We assume that the handler filtering is tested in details elsewhere (for all handlers). +# Here, we only test if it is applied or not applied. + + +async def test_daemon_filtration_satisfied( + registry, resource, dummy, + caplog, assert_logs, k8s_mocked, simulate_cycle): + caplog.set_level(logging.DEBUG) + + @kopf.daemon(resource.group, resource.version, resource.plural, registry=registry, id='fn', + labels={'a': 'value', 'b': kopf.PRESENT, 'c': kopf.ABSENT}, + annotations={'x': 'value', 'y': kopf.PRESENT, 'z': kopf.ABSENT}) + async def fn(**kwargs): + dummy.kwargs = kwargs + dummy.steps['called'].set() + + event_body = {'metadata': {'labels': {'a': 'value', 'b': '...'}, + 'annotations': {'x': 'value', 'y': '...'}, + 'finalizers': [FINALIZER]}} + await simulate_cycle(event_body) + + await dummy.steps['called'].wait() + await dummy.wait_for_daemon_done() + + +@pytest.mark.parametrize('labels, annotations', [ + # Annotations mismatching (but labels are matching): + ({'a': 'value', 'b': '...'}, {'x': 'mismatching-value', 'b': '...'}, ), # x must be "value". 
+ ({'a': 'value', 'b': '...'}, {'x': 'value', 'y': '...', 'z': '...'}), # z must be absent + ({'a': 'value', 'b': '...'}, {'x': 'value'}), # y must be present + # labels mismatching (but annotations are matching): + ({'a': 'mismatching-value', 'b': '...'}, {'x': 'value', 'y': '...'}), + ({'a': 'value', 'b': '...', 'c': '...'}, {'x': 'value', 'y': '...'}), + ({'a': 'value'}, {'x': 'value', 'y': '...'}), +]) +async def test_daemon_filtration_mismatched( + registry, resource, mocker, labels, annotations, + caplog, assert_logs, k8s_mocked, simulate_cycle): + caplog.set_level(logging.DEBUG) + spawn_resource_daemons = mocker.patch('kopf.reactor.daemons.spawn_resource_daemons') + + @kopf.daemon(resource.group, resource.version, resource.plural, registry=registry, id='fn', + labels={'a': 'value', 'b': kopf.PRESENT, 'c': kopf.ABSENT}, + annotations={'x': 'value', 'y': kopf.PRESENT, 'z': kopf.ABSENT}) + async def fn(**kwargs): + pass + + event_body = {'metadata': {'labels': labels, + 'annotations': annotations, + 'finalizers': [FINALIZER]}} + await simulate_cycle(event_body) + + assert spawn_resource_daemons.called + assert spawn_resource_daemons.call_args_list[0][1]['handlers'] == [] diff --git a/tests/handling/daemons/test_daemon_spawning.py b/tests/handling/daemons/test_daemon_spawning.py new file mode 100644 index 00000000..9539f15e --- /dev/null +++ b/tests/handling/daemons/test_daemon_spawning.py @@ -0,0 +1,44 @@ +import logging + +import kopf + + +async def test_daemon_is_spawned_at_least_once( + registry, resource, dummy, + caplog, assert_logs, k8s_mocked, simulate_cycle): + caplog.set_level(logging.DEBUG) + + @kopf.daemon(resource.group, resource.version, resource.plural, registry=registry, id='fn') + async def fn(**kwargs): + dummy.mock() + dummy.kwargs = kwargs + dummy.steps['called'].set() + + await simulate_cycle({}) + + await dummy.steps['called'].wait() + await dummy.wait_for_daemon_done() + + assert dummy.mock.call_count == 1 # not restarted + + +async def test_daemon_initial_backoff_obeyed( + registry, resource, dummy, + caplog, assert_logs, k8s_mocked, simulate_cycle): + caplog.set_level(logging.DEBUG) + + @kopf.daemon(resource.group, resource.version, resource.plural, registry=registry, id='fn', + initial_backoff=1.0) + async def fn(**kwargs): + dummy.mock() + dummy.kwargs = kwargs + dummy.steps['called'].set() + + await simulate_cycle({}) + + await dummy.steps['called'].wait() + await dummy.wait_for_daemon_done() + + assert k8s_mocked.sleep_or_wait.call_count >= 1 + assert k8s_mocked.sleep_or_wait.call_count <= 2 # one optional extra call for sleep(None) + assert k8s_mocked.sleep_or_wait.call_args_list[0][0][0] == 1.0 # [call#][args/kwargs][arg#] diff --git a/tests/handling/daemons/test_daemon_termination.py b/tests/handling/daemons/test_daemon_termination.py new file mode 100644 index 00000000..b6442af7 --- /dev/null +++ b/tests/handling/daemons/test_daemon_termination.py @@ -0,0 +1,136 @@ +import asyncio +import logging + +import pytest + +import kopf +from kopf.storage.finalizers import FINALIZER + + +async def test_daemon_exits_gracefully_and_instantly_via_stopper( + registry, resource, dummy, simulate_cycle, + caplog, assert_logs, k8s_mocked, frozen_time, mocker, timer): + caplog.set_level(logging.DEBUG) + + # A daemon-under-test. 
+ @kopf.daemon(resource.group, resource.version, resource.plural, registry=registry, id='fn') + async def fn(**kwargs): + dummy.kwargs = kwargs + dummy.steps['called'].set() + await kwargs['stopped'].wait() + + # 0th cycle:tTrigger spawning and wait until ready. Assume the finalizers are already added. + event_object = {'metadata': {'finalizers': [FINALIZER]}} + await simulate_cycle(event_object) + await dummy.steps['called'].wait() + + # 1st stage: trigger termination due to resource deletion. + mocker.resetall() + event_object.setdefault('metadata', {}).update({'deletionTimestamp': '...'}) + await simulate_cycle(event_object) + + # Check that the daemon has exited near-instantly, with no delays. + with timer: + await dummy.wait_for_daemon_done() + + assert timer.seconds < 0.01 # near-instantly + assert k8s_mocked.sleep_or_wait.call_count == 1 + assert k8s_mocked.sleep_or_wait.call_args_list[0][0][0] is None + assert k8s_mocked.patch_obj.call_count == 1 + assert k8s_mocked.patch_obj.call_args_list[0][1]['patch']['metadata']['finalizers'] == [] + + +async def test_daemon_exits_via_cancellation_with_backoff( + registry, resource, dummy, simulate_cycle, + caplog, assert_logs, k8s_mocked, frozen_time, mocker): + caplog.set_level(logging.DEBUG) + + # A daemon-under-test. + @kopf.daemon(resource.group, resource.version, resource.plural, registry=registry, id='fn', + cancellation_backoff=5, cancellation_timeout=10) + async def fn(**kwargs): + dummy.kwargs = kwargs + dummy.steps['called'].set() + await asyncio.Event().wait() # this one is cancelled. + + # Trigger spawning and wait until ready. Assume the finalizers are already added. + event_object = {'metadata': {'finalizers': [FINALIZER]}} + await simulate_cycle(event_object) + await dummy.steps['called'].wait() + + # 1st stage: trigger termination due to resource deletion. Wait for backoff. + mocker.resetall() + event_object.setdefault('metadata', {}).update({'deletionTimestamp': '...'}) + await simulate_cycle(event_object) + + assert k8s_mocked.sleep_or_wait.call_count == 1 + assert k8s_mocked.sleep_or_wait.call_args_list[0][0][0] == 5.0 + assert k8s_mocked.patch_obj.call_count == 1 + assert k8s_mocked.patch_obj.call_args_list[0][1]['patch']['status']['kopf']['dummy'] + + # 2nd cycle: cancelling after the backoff is reached. Wait for cancellation timeout. + mocker.resetall() + frozen_time.tick(5) # backoff time or slightly above it + await simulate_cycle(event_object) + + assert k8s_mocked.sleep_or_wait.call_count == 1 + assert k8s_mocked.sleep_or_wait.call_args_list[0][0][0] == 10.0 + assert k8s_mocked.patch_obj.call_count == 1 + assert k8s_mocked.patch_obj.call_args_list[0][1]['patch']['status']['kopf']['dummy'] + + # 3rd cycle: the daemon has exited, the resource should be unblocked from actual deletion. + mocker.resetall() + frozen_time.tick(1) # any time below timeout + await simulate_cycle(event_object) + await dummy.wait_for_daemon_done() + + assert k8s_mocked.sleep_or_wait.call_count == 0 + assert k8s_mocked.patch_obj.call_count == 1 + assert k8s_mocked.patch_obj.call_args_list[0][1]['patch']['metadata']['finalizers'] == [] + + +async def test_daemon_is_abandoned_due_to_cancellation_timeout_reached( + registry, resource, dummy, simulate_cycle, + caplog, assert_logs, k8s_mocked, frozen_time, mocker): + caplog.set_level(logging.DEBUG) + + # A daemon-under-test. 
+ @kopf.daemon(resource.group, resource.version, resource.plural, registry=registry, id='fn', + cancellation_timeout=10) + async def fn(**kwargs): + dummy.kwargs = kwargs + dummy.steps['called'].set() + try: + await dummy.steps['finish'].wait() # this one is cancelled. + except asyncio.CancelledError: + await dummy.steps['finish'].wait() # simulated disobedience to be cancelled. + + # 0th cycle:tTrigger spawning and wait until ready. Assume the finalizers are already added. + event_object = {'metadata': {'finalizers': [FINALIZER]}} + await simulate_cycle(event_object) + await dummy.steps['called'].wait() + + # 1st stage: trigger termination due to resource deletion. Wait for backoff. + mocker.resetall() + event_object.setdefault('metadata', {}).update({'deletionTimestamp': '...'}) + await simulate_cycle(event_object) + + assert k8s_mocked.sleep_or_wait.call_count == 1 + assert k8s_mocked.sleep_or_wait.call_args_list[0][0][0] == 10.0 + assert k8s_mocked.patch_obj.call_count == 1 + assert k8s_mocked.patch_obj.call_args_list[0][1]['patch']['status']['kopf']['dummy'] + + # 2rd cycle: the daemon has exited, the resource should be unblocked from actual deletion. + mocker.resetall() + frozen_time.tick(50) + with pytest.warns(ResourceWarning, match=r"Daemon .+ did not exit in time"): + await simulate_cycle(event_object) + + assert k8s_mocked.sleep_or_wait.call_count == 0 + assert k8s_mocked.patch_obj.call_count == 1 + assert k8s_mocked.patch_obj.call_args_list[0][1]['patch']['metadata']['finalizers'] == [] + assert_logs(["Daemon 'fn' did not exit in time. Leaving it orphaned."]) + + # Cleanup. + dummy.steps['finish'].set() + await dummy.wait_for_daemon_done() diff --git a/tests/handling/daemons/test_timer_errors.py b/tests/handling/daemons/test_timer_errors.py new file mode 100644 index 00000000..fb579618 --- /dev/null +++ b/tests/handling/daemons/test_timer_errors.py @@ -0,0 +1,186 @@ +import logging + +import kopf +from kopf.storage.finalizers import FINALIZER + + +async def test_timer_stopped_on_permanent_error( + registry, resource, dummy, + caplog, assert_logs, k8s_mocked, simulate_cycle): + caplog.set_level(logging.DEBUG) + + @kopf.timer(resource.group, resource.version, resource.plural, registry=registry, id='fn', + backoff=0.01, interval=1.0) + async def fn(**kwargs): + dummy.mock() + dummy.kwargs = kwargs + dummy.steps['called'].set() + kwargs['stopped']._stopper.set(reason=kopf.DaemonStoppingReason.NONE) # to exit the cycle + raise kopf.PermanentError("boo!") + + event_object = {'metadata': {'finalizers': [FINALIZER]}} + await simulate_cycle(event_object) + + await dummy.steps['called'].wait() + await dummy.wait_for_daemon_done() + + assert dummy.mock.call_count == 1 + assert k8s_mocked.sleep_or_wait.call_count == 1 # one for each retry + assert k8s_mocked.sleep_or_wait.call_args_list[0][0][0] == 1.0 + + assert_logs([ + "Timer 'fn' failed permanently: boo!", + ], prohibited=[ + "Timer 'fn' succeeded.", + ]) + + +async def test_timer_stopped_on_arbitrary_errors_with_mode_permanent( + registry, resource, dummy, + caplog, assert_logs, k8s_mocked, simulate_cycle): + caplog.set_level(logging.DEBUG) + + @kopf.timer(resource.group, resource.version, resource.plural, registry=registry, id='fn', + errors=kopf.ErrorsMode.PERMANENT, backoff=0.01, interval=1.0) + async def fn(**kwargs): + dummy.mock() + dummy.kwargs = kwargs + dummy.steps['called'].set() + kwargs['stopped']._stopper.set(reason=kopf.DaemonStoppingReason.NONE) # to exit the cycle + raise Exception("boo!") + + event_object = 
{'metadata': {'finalizers': [FINALIZER]}} + await simulate_cycle(event_object) + + await dummy.steps['called'].wait() + await dummy.wait_for_daemon_done() + + assert dummy.mock.call_count == 1 + assert k8s_mocked.sleep_or_wait.call_count == 1 # one for each retry + assert k8s_mocked.sleep_or_wait.call_args_list[0][0][0] == 1.0 + + assert_logs([ + "Timer 'fn' failed with an exception. Will stop.", + ], prohibited=[ + "Timer 'fn' succeeded.", + ]) + + +async def test_timer_retried_on_temporary_error( + registry, settings, resource, dummy, manual_time, + caplog, assert_logs, k8s_mocked, simulate_cycle): + caplog.set_level(logging.DEBUG) + + @kopf.timer(resource.group, resource.version, resource.plural, registry=registry, id='fn', + backoff=1.0, interval=1.0) + async def fn(retry, **kwargs): + dummy.mock() + dummy.kwargs = kwargs + dummy.steps['called'].set() + if not retry: + raise kopf.TemporaryError("boo!", delay=1.0) + else: + kwargs['stopped']._stopper.set(reason=kopf.DaemonStoppingReason.NONE) # to exit the cycle + dummy.steps['finish'].set() + + event_object = {'metadata': {'finalizers': [FINALIZER]}} + await simulate_cycle(event_object) + + await dummy.steps['called'].wait() + await dummy.steps['finish'].wait() + await dummy.wait_for_daemon_done() + + assert k8s_mocked.sleep_or_wait.call_count == 2 # one for each retry + assert k8s_mocked.sleep_or_wait.call_args_list[0][0][0] == [1.0] # delays + assert k8s_mocked.sleep_or_wait.call_args_list[1][0][0] == 1.0 # interval + + assert_logs([ + "Timer 'fn' failed temporarily: boo!", + "Timer 'fn' succeeded.", + ]) + + +async def test_timer_retried_on_arbitrary_error_with_mode_temporary( + registry, resource, dummy, + caplog, assert_logs, k8s_mocked, simulate_cycle, manual_time): + caplog.set_level(logging.DEBUG) + + @kopf.timer(resource.group, resource.version, resource.plural, registry=registry, id='fn', + errors=kopf.ErrorsMode.TEMPORARY, backoff=1.0, interval=1.0) + async def fn(retry, **kwargs): + dummy.mock() + dummy.kwargs = kwargs + dummy.steps['called'].set() + if not retry: + raise Exception("boo!") + else: + kwargs['stopped']._stopper.set(reason=kopf.DaemonStoppingReason.NONE) # to exit the cycle + dummy.steps['finish'].set() + + event_object = {'metadata': {'finalizers': [FINALIZER]}} + await simulate_cycle(event_object) + + await dummy.steps['called'].wait() + await dummy.steps['finish'].wait() + await dummy.wait_for_daemon_done() + + assert k8s_mocked.sleep_or_wait.call_count == 2 # one for each retry + assert k8s_mocked.sleep_or_wait.call_args_list[0][0][0] == [1.0] # delays + assert k8s_mocked.sleep_or_wait.call_args_list[1][0][0] == 1.0 # interval + + assert_logs([ + "Timer 'fn' failed with an exception. 
Will retry.", + "Timer 'fn' succeeded.", + ]) + + +async def test_timer_retried_until_retries_limit( + registry, resource, dummy, + caplog, assert_logs, k8s_mocked, simulate_cycle, manual_time): + caplog.set_level(logging.DEBUG) + + @kopf.timer(resource.group, resource.version, resource.plural, registry=registry, id='fn', + retries=3, interval=1.0) + async def fn(**kwargs): + dummy.mock() + dummy.kwargs = kwargs + dummy.steps['called'].set() + if dummy.mock.call_count >= 5: + kwargs['stopped']._stopper.set(reason=kopf.DaemonStoppingReason.NONE) # to exit the cycle + raise kopf.TemporaryError("boo!", delay=1.0) + + await simulate_cycle({}) + await dummy.steps['called'].wait() + await dummy.wait_for_daemon_done() + + assert k8s_mocked.sleep_or_wait.call_count >= 4 # one for each retry + assert k8s_mocked.sleep_or_wait.call_args_list[0][0][0] == [1.0] # delays + assert k8s_mocked.sleep_or_wait.call_args_list[1][0][0] == [1.0] # delays + assert k8s_mocked.sleep_or_wait.call_args_list[2][0][0] == [1.0] # delays + assert k8s_mocked.sleep_or_wait.call_args_list[3][0][0] == 1.0 # interval + + +async def test_timer_retried_until_timeout( + registry, resource, dummy, + caplog, assert_logs, k8s_mocked, simulate_cycle, manual_time): + caplog.set_level(logging.DEBUG) + + @kopf.timer(resource.group, resource.version, resource.plural, registry=registry, id='fn', + timeout=3.0, interval=1.0) + async def fn(**kwargs): + dummy.mock() + dummy.kwargs = kwargs + dummy.steps['called'].set() + if dummy.mock.call_count >= 5: + kwargs['stopped']._stopper.set(reason=kopf.DaemonStoppingReason.NONE) # to exit the cycle + raise kopf.TemporaryError("boo!", delay=1.0) + + await simulate_cycle({}) + await dummy.steps['called'].wait() + await dummy.wait_for_daemon_done() + + assert k8s_mocked.sleep_or_wait.call_count >= 4 # one for each retry + assert k8s_mocked.sleep_or_wait.call_args_list[0][0][0] == [1.0] # delays + assert k8s_mocked.sleep_or_wait.call_args_list[1][0][0] == [1.0] # delays + assert k8s_mocked.sleep_or_wait.call_args_list[2][0][0] == [1.0] # delays + assert k8s_mocked.sleep_or_wait.call_args_list[3][0][0] == 1.0 # interval diff --git a/tests/handling/daemons/test_timer_filtration.py b/tests/handling/daemons/test_timer_filtration.py new file mode 100644 index 00000000..8bf55e01 --- /dev/null +++ b/tests/handling/daemons/test_timer_filtration.py @@ -0,0 +1,62 @@ +import logging + +import pytest + +import kopf +from kopf.storage.finalizers import FINALIZER + + +# We assume that the handler filtering is tested in details elsewhere (for all handlers). +# Here, we only test if it is applied or not applied. 
+ + +async def test_timer_filtration_satisfied( + registry, resource, dummy, + caplog, assert_logs, k8s_mocked, simulate_cycle): + caplog.set_level(logging.DEBUG) + + @kopf.timer(resource.group, resource.version, resource.plural, registry=registry, id='fn', + labels={'a': 'value', 'b': kopf.PRESENT, 'c': kopf.ABSENT}, + annotations={'x': 'value', 'y': kopf.PRESENT, 'z': kopf.ABSENT}) + async def fn(**kwargs): + dummy.kwargs = kwargs + dummy.steps['called'].set() + + event_body = {'metadata': {'labels': {'a': 'value', 'b': '...'}, + 'annotations': {'x': 'value', 'y': '...'}, + 'finalizers': [FINALIZER]}} + await simulate_cycle(event_body) + + await dummy.steps['called'].wait() + await dummy.wait_for_daemon_done() + + +@pytest.mark.parametrize('labels, annotations', [ + # Annotations mismatching (but labels are matching): + ({'a': 'value', 'b': '...'}, {'x': 'mismatching-value', 'b': '...'}, ), # x must be "value". + ({'a': 'value', 'b': '...'}, {'x': 'value', 'y': '...', 'z': '...'}), # z must be absent + ({'a': 'value', 'b': '...'}, {'x': 'value'}), # y must be present + # labels mismatching (but annotations are matching): + ({'a': 'mismatching-value', 'b': '...'}, {'x': 'value', 'y': '...'}), + ({'a': 'value', 'b': '...', 'c': '...'}, {'x': 'value', 'y': '...'}), + ({'a': 'value'}, {'x': 'value', 'y': '...'}), +]) +async def test_timer_filtration_mismatched( + registry, resource, mocker, labels, annotations, + caplog, assert_logs, k8s_mocked, simulate_cycle): + caplog.set_level(logging.DEBUG) + spawn_resource_daemons = mocker.patch('kopf.reactor.daemons.spawn_resource_daemons') + + @kopf.timer(resource.group, resource.version, resource.plural, registry=registry, id='fn', + labels={'a': 'value', 'b': kopf.PRESENT, 'c': kopf.ABSENT}, + annotations={'x': 'value', 'y': kopf.PRESENT, 'z': kopf.ABSENT}) + async def fn(**kwargs): + pass + + event_body = {'metadata': {'labels': labels, + 'annotations': annotations, + 'finalizers': [FINALIZER]}} + await simulate_cycle(event_body) + + assert spawn_resource_daemons.called + assert spawn_resource_daemons.call_args_list[0][1]['handlers'] == [] diff --git a/tests/handling/daemons/test_timer_intervals.py b/tests/handling/daemons/test_timer_intervals.py new file mode 100644 index 00000000..2e3c52e9 --- /dev/null +++ b/tests/handling/daemons/test_timer_intervals.py @@ -0,0 +1,58 @@ +import logging + +import kopf + +# TODO: tests for idle= (more complicated) + + +async def test_timer_regular_interval( + registry, resource, dummy, + caplog, assert_logs, k8s_mocked, simulate_cycle, frozen_time): + caplog.set_level(logging.DEBUG) + + @kopf.timer(resource.group, resource.version, resource.plural, registry=registry, id='fn', + interval=1.0, sharp=False) + async def fn(**kwargs): + dummy.mock() + dummy.kwargs = kwargs + dummy.steps['called'].set() + frozen_time.tick(0.3) + if dummy.mock.call_count >= 2: + dummy.steps['finish'].set() + kwargs['stopped']._stopper.set(reason=kopf.DaemonStoppingReason.NONE) # to exit the cycle + + await simulate_cycle({}) + await dummy.steps['called'].wait() + await dummy.wait_for_daemon_done() + + assert dummy.mock.call_count == 2 + assert k8s_mocked.sleep_or_wait.call_count == 2 + assert k8s_mocked.sleep_or_wait.call_args_list[0][0][0] == 1.0 + assert k8s_mocked.sleep_or_wait.call_args_list[1][0][0] == 1.0 + + +async def test_timer_sharp_interval( + registry, resource, dummy, + caplog, assert_logs, k8s_mocked, simulate_cycle, frozen_time): + caplog.set_level(logging.DEBUG) + + @kopf.timer(resource.group, resource.version, 
resource.plural, registry=registry, id='fn', + interval=1.0, sharp=True) + async def fn(**kwargs): + dummy.mock() + dummy.kwargs = kwargs + dummy.steps['called'].set() + frozen_time.tick(0.3) + if dummy.mock.call_count >= 2: + dummy.steps['finish'].set() + kwargs['stopped']._stopper.set(reason=kopf.DaemonStoppingReason.NONE) # to exit the cycle + + await simulate_cycle({}) + await dummy.steps['called'].wait() + await dummy.steps['finish'].wait() + await dummy.wait_for_daemon_done() + + assert dummy.mock.call_count == 2 + assert k8s_mocked.sleep_or_wait.call_count == 2 + assert 0.7 <= k8s_mocked.sleep_or_wait.call_args_list[0][0][0] < 0.71 + assert 0.7 <= k8s_mocked.sleep_or_wait.call_args_list[1][0][0] < 0.71 diff --git a/tests/handling/daemons/test_timer_triggering.py b/tests/handling/daemons/test_timer_triggering.py new file mode 100644 index 00000000..86fd755f --- /dev/null +++ b/tests/handling/daemons/test_timer_triggering.py @@ -0,0 +1,52 @@ +import logging + +import kopf + + +async def test_timer_is_spawned_at_least_once( + registry, resource, dummy, + caplog, assert_logs, k8s_mocked, simulate_cycle): + caplog.set_level(logging.DEBUG) + + @kopf.timer(resource.group, resource.version, resource.plural, registry=registry, id='fn', + interval=1.0) + async def fn(**kwargs): + dummy.mock() + dummy.kwargs = kwargs + dummy.steps['called'].set() + kwargs['stopped']._stopper.set(reason=kopf.DaemonStoppingReason.NONE) # to exit the cycle + + await simulate_cycle({}) + await dummy.steps['called'].wait() + + assert dummy.mock.call_count == 1 + assert dummy.kwargs['retry'] == 0 + assert k8s_mocked.sleep_or_wait.call_count == 1 + assert k8s_mocked.sleep_or_wait.call_args_list[0][0][0] == 1.0 + + await dummy.wait_for_daemon_done() + + +async def test_timer_initial_backoff_obeyed( + registry, resource, dummy, + caplog, assert_logs, k8s_mocked, simulate_cycle): + caplog.set_level(logging.DEBUG) + + @kopf.timer(resource.group, resource.version, resource.plural, registry=registry, id='fn', + initial_backoff=5.0, interval=1.0) + async def fn(**kwargs): + dummy.mock() + dummy.kwargs = kwargs + dummy.steps['called'].set() + kwargs['stopped']._stopper.set(reason=kopf.DaemonStoppingReason.NONE) # to exit the cycle + + await simulate_cycle({}) + await dummy.steps['called'].wait() + + assert dummy.mock.call_count == 1 + assert dummy.kwargs['retry'] == 0 + assert k8s_mocked.sleep_or_wait.call_count == 2 + assert k8s_mocked.sleep_or_wait.call_args_list[0][0][0] == 5.0 + assert k8s_mocked.sleep_or_wait.call_args_list[1][0][0] == 1.0 + + await dummy.wait_for_daemon_done() From 11ddb76fabaa3c275d597f6bc7b894ffa1dc00c1 Mon Sep 17 00:00:00 2001 From: Sergey Vasilyev Date: Tue, 31 Mar 2020 13:00:57 +0200 Subject: [PATCH 16/20] Improve logging when a daemon exits gracefully --- kopf/reactor/daemons.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kopf/reactor/daemons.py b/kopf/reactor/daemons.py index 81a0fd09..b9dac966 100644 --- a/kopf/reactor/daemons.py +++ b/kopf/reactor/daemons.py @@ -234,11 +234,11 @@ async def stop_daemon( raise RuntimeError(f"Unsupported daemon handler: {daemon.handler!r}") # Whatever happens with other flags & logs & timings, this flag must be surely set. + # It might be so, that the daemon exits instantly (if written properly: give it chance). daemon.stopper.set(reason=primitives.DaemonStoppingReason.OPERATOR_EXITING) - - # It might be so, that the daemon exits instantly (if written properly). 
- # Avoid waiting and cancelling below: just give the asyncio event loop an extra cycle. - await asyncio.sleep(0) + await asyncio.sleep(0) # give a chance to exit gracefully if it can. + if daemon.task.done(): + daemon.logger.debug(f"Daemon {daemon_id!r} has exited gracefully.") # Try different approaches to exiting the daemon based on timings. if not daemon.task.done() and backoff is not None: From 7b3a8cc7823c6fddfba0624ae545de0b85189cd9 Mon Sep 17 00:00:00 2001 From: Sergey Vasilyev Date: Tue, 31 Mar 2020 17:26:06 +0200 Subject: [PATCH 17/20] Rename `initial_backoff` to `initial_delay` for clarify --- docs/daemons.rst | 2 +- docs/timers.rst | 4 ++-- kopf/on.py | 8 ++++---- kopf/reactor/daemons.py | 8 ++++---- kopf/structs/handlers.py | 2 +- tests/handling/daemons/test_daemon_spawning.py | 4 ++-- tests/handling/daemons/test_timer_triggering.py | 4 ++-- 7 files changed, 16 insertions(+), 16 deletions(-) diff --git a/docs/daemons.rst b/docs/daemons.rst index 6b16b7ef..15f1f350 100644 --- a/docs/daemons.rst +++ b/docs/daemons.rst @@ -211,7 +211,7 @@ It is possible to postpone the daemon spawning: import asyncio import kopf - @kopf.daemon('zalando.org', 'v1', 'kopfexamples', initial_backoff=30) + @kopf.daemon('zalando.org', 'v1', 'kopfexamples', initial_delay=30) async def monitor_kex(stopped, **kwargs): while True: await asyncio.sleep(1.0) diff --git a/docs/timers.rst b/docs/timers.rst index 2b2d27a9..2db87d1c 100644 --- a/docs/timers.rst +++ b/docs/timers.rst @@ -95,7 +95,7 @@ It is possible to postpone the invocations: import time import kopf - @kopf.timer('zalando.org', 'v1', 'kopfexamples', interval=1, initial_backoff=5) + @kopf.timer('zalando.org', 'v1', 'kopfexamples', interval=1, initial_delay=5) def ping_kex(spec, **kwargs): print(f"FIELD={spec['field']}") @@ -115,7 +115,7 @@ the resources every 10 seconds if they are unmodified for 10 minutes: import kopf @kopf.timer('zalando.org', 'v1', 'kopfexamples', - initial_backoff=60, interval=10, idle=600) + initial_delay=60, interval=10, idle=600) def ping_kex(spec, **kwargs): pass diff --git a/kopf/on.py b/kopf/on.py index 0d9fe8ca..bbc47fbb 100644 --- a/kopf/on.py +++ b/kopf/on.py @@ -337,7 +337,7 @@ def daemon( # lgtm[py/similar-function] labels: Optional[filters.MetaFilter] = None, annotations: Optional[filters.MetaFilter] = None, when: Optional[callbacks.WhenFilterFn] = None, - initial_backoff: Optional[float] = None, + initial_delay: Optional[float] = None, cancellation_backoff: Optional[float] = None, cancellation_timeout: Optional[float] = None, cancellation_polling: Optional[float] = None, @@ -353,7 +353,7 @@ def decorator(fn: callbacks.ResourceDaemonFn) -> callbacks.ResourceDaemonFn: fn=fn, id=real_id, errors=errors, timeout=timeout, retries=retries, backoff=backoff, cooldown=cooldown, labels=labels, annotations=annotations, when=when, - initial_backoff=initial_backoff, requires_finalizer=True, #TODO: requires_finalizer? "optional"? + initial_delay=initial_delay, requires_finalizer=True, #TODO: requires_finalizer? "optional"? 
cancellation_backoff=cancellation_backoff, cancellation_timeout=cancellation_timeout, cancellation_polling=cancellation_polling, @@ -376,7 +376,7 @@ def timer( # lgtm[py/similar-function] labels: Optional[filters.MetaFilter] = None, annotations: Optional[filters.MetaFilter] = None, when: Optional[callbacks.WhenFilterFn] = None, - initial_backoff: Optional[float] = None, + initial_delay: Optional[float] = None, sharp: Optional[bool] = None, idle: Optional[float] = None, interval: Optional[float] = None, @@ -392,7 +392,7 @@ def decorator(fn: callbacks.ResourceTimerFn) -> callbacks.ResourceTimerFn: fn=fn, id=real_id, errors=errors, timeout=timeout, retries=retries, backoff=backoff, cooldown=cooldown, labels=labels, annotations=annotations, when=when, - initial_backoff=initial_backoff, requires_finalizer=None, + initial_delay=initial_delay, requires_finalizer=None, sharp=sharp, idle=idle, interval=interval, ) real_registry.resource_spawning_handlers[real_resource].append(handler) diff --git a/kopf/reactor/daemons.py b/kopf/reactor/daemons.py index b9dac966..c8afb3ee 100644 --- a/kopf/reactor/daemons.py +++ b/kopf/reactor/daemons.py @@ -303,8 +303,8 @@ async def _resource_daemon( """ logger = cause.logger - if handler.initial_backoff is not None: - await sleeping.sleep_or_wait(handler.initial_backoff, cause.stopper) + if handler.initial_delay is not None: + await sleeping.sleep_or_wait(handler.initial_delay, cause.stopper) # Similar to activities (in-memory execution), but applies patches on every attempt. state = states.State.from_scratch(handlers=[handler]) @@ -363,8 +363,8 @@ async def _resource_timer( but calls the handling functions from time to time. """ - if handler.initial_backoff is not None: - await sleeping.sleep_or_wait(handler.initial_backoff, cause.stopper) + if handler.initial_delay is not None: + await sleeping.sleep_or_wait(handler.initial_delay, cause.stopper) # Similar to activities (in-memory execution), but applies patches on every attempt. 
state = states.State.from_scratch(handlers=[handler]) diff --git a/kopf/structs/handlers.py b/kopf/structs/handlers.py index a7a7d9b1..31aa58df 100644 --- a/kopf/structs/handlers.py +++ b/kopf/structs/handlers.py @@ -139,7 +139,7 @@ def event(self) -> Optional[Reason]: @dataclasses.dataclass class ResourceSpawningHandler(ResourceHandler): requires_finalizer: Optional[bool] - initial_backoff: Optional[float] + initial_delay: Optional[float] @dataclasses.dataclass diff --git a/tests/handling/daemons/test_daemon_spawning.py b/tests/handling/daemons/test_daemon_spawning.py index 9539f15e..07252899 100644 --- a/tests/handling/daemons/test_daemon_spawning.py +++ b/tests/handling/daemons/test_daemon_spawning.py @@ -22,13 +22,13 @@ async def fn(**kwargs): assert dummy.mock.call_count == 1 # not restarted -async def test_daemon_initial_backoff_obeyed( +async def test_daemon_initial_delay_obeyed( registry, resource, dummy, caplog, assert_logs, k8s_mocked, simulate_cycle): caplog.set_level(logging.DEBUG) @kopf.daemon(resource.group, resource.version, resource.plural, registry=registry, id='fn', - initial_backoff=1.0) + initial_delay=1.0) async def fn(**kwargs): dummy.mock() dummy.kwargs = kwargs diff --git a/tests/handling/daemons/test_timer_triggering.py b/tests/handling/daemons/test_timer_triggering.py index 86fd755f..4dcec37f 100644 --- a/tests/handling/daemons/test_timer_triggering.py +++ b/tests/handling/daemons/test_timer_triggering.py @@ -27,13 +27,13 @@ async def fn(**kwargs): await dummy.wait_for_daemon_done() -async def test_timer_initial_backoff_obeyed( +async def test_timer_initial_delay_obeyed( registry, resource, dummy, caplog, assert_logs, k8s_mocked, simulate_cycle): caplog.set_level(logging.DEBUG) @kopf.timer(resource.group, resource.version, resource.plural, registry=registry, id='fn', - initial_backoff=5.0, interval=1.0) + initial_delay=5.0, interval=1.0) async def fn(**kwargs): dummy.mock() dummy.kwargs = kwargs From 75e227ffed62a039a437bf9a62f40810b55cde3a Mon Sep 17 00:00:00 2001 From: Sergey Vasilyev Date: Tue, 31 Mar 2020 17:28:23 +0200 Subject: [PATCH 18/20] Fix some wording and phrasing in the daemon docs (as per PR) --- docs/daemons.rst | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/docs/daemons.rst b/docs/daemons.rst index 15f1f350..7e78e171 100644 --- a/docs/daemons.rst +++ b/docs/daemons.rst @@ -2,14 +2,15 @@ Daemons ======= -Daemons are a special type of handlers for background logic that accompanies +Daemons are a special type of handler for background logic that accompanies the Kubernetes resources during their life cycle. Unlike event-driven short-running handlers declared with ``@kopf.on``, -daemons are started when an object is created or an operator is started, +daemons are started for every individual object when it is created +(or when an operator is started/restarted while the object exists), and are capable of running indefinitely (or infinitely) long. -The daemons are stopped when the object is deleted +The object's daemons are stopped when the object is deleted or the whole operator is exiting/restarting. @@ -89,7 +90,7 @@ it is cancelled when the operator exits and stops all "hung" left-over tasks .. note:: - The MUST_ / SHOULD_ separation is due to Python has no way of terminating + The MUST_ / SHOULD_ separation is due to Python having no way to terminate a thread unless the thread exits by its own. The :kwarg:`stopped` flag is a way to signal the thread it should exit. 
If :kwarg:`stopped` is not checked, the synchronous daemons will run forever or until an error happens. @@ -187,7 +188,7 @@ configured nor desired, ``stopped.wait()`` can be used too (with ``await``): while not stopped: await stopped.wait(10) -This way, the daemon will exit as soon as possible when the :kwarg:`stopped`` +This way, the daemon will exit as soon as possible when the :kwarg:`stopped` is set, not when the next sleep is over. Therefore, the sleeps can be of any duration while the daemon remains terminable (leads to no OS resource leakage). @@ -217,9 +218,9 @@ It is possible to postpone the daemon spawning: await asyncio.sleep(1.0) -The daemon will not be started during the specified time after it should -be started. For example, this can be used to give some time for regular -event-driven handlers to finish without producing too much activity. +The start of the daemon will be delayed by 30 seconds after the resource +creation (or operator startup). For example, this can be used to give some time +for regular event-driven handlers to finish without producing too much activity. Restarting From 78734f371e0aed35e971d913ce0a6e40a6107de2 Mon Sep 17 00:00:00 2001 From: Sergey Vasilyev Date: Tue, 31 Mar 2020 22:38:44 +0200 Subject: [PATCH 19/20] Document the daemons' executors/pools scaling for large clusters --- docs/configuration.rst | 15 +++++++++++++++ docs/daemons.rst | 9 ++++++--- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index 15e00b98..76f328cc 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -70,6 +70,8 @@ The event-posting can be disabled completely (the default is to be enabled): directly with an API client library instead of Kopf-provided toolkit. +.. _configure-sync-handlers: + Synchronous handlers ==================== @@ -102,6 +104,19 @@ handler itself. To avoid it, make the on-startup handler asynchronous: async def configure(settings: kopf.OperatorSettings, **_): settings.execution.executor = concurrent.futures.ThreadPoolExecutor() +The same executor is used both for regular sync handlers and for sync daemons. +If you expect large number of synchronous daemons (e.g. for large clusters), +make sure to pre-scale the executor accordingly +(the default in Python is 5x times the CPU cores): + +.. code-block:: python + + import kopf + + @kopf.on.startup() + async def configure(settings: kopf.OperatorSettings, **kwargs): + settings.execution.max_workers = 1000 + API timeouts ============ diff --git a/docs/daemons.rst b/docs/daemons.rst index 7e78e171..1e953e48 100644 --- a/docs/daemons.rst +++ b/docs/daemons.rst @@ -40,9 +40,12 @@ with ``@kopf.daemon`` and make it run for long time or forever: Synchronous functions are executed in threads, asynchronous functions are executed directly in the asyncio event loop of the operator -- same as with -regular handlers, except that a separate thread pool is used for daemons -to prevent thread pool depletion for regular handlers. -See :doc:`async`. +regular handlers. See :doc:`async`. + +The same executor is used both for regular sync handlers and for sync daemons. +If you expect large number of synchronous daemons (e.g. for large clusters), +make sure to pre-scale the executor accordingly. +See :doc:`configuration` (:ref:`configure-sync-handlers`). 
Termination From 8748486f8156202cfd078bdebb2454ebd3dcca0e Mon Sep 17 00:00:00 2001 From: Sergey Vasilyev Date: Tue, 31 Mar 2020 22:41:52 +0200 Subject: [PATCH 20/20] Fix few more wording & phrasing issues (as per PR) --- docs/timers.rst | 2 +- kopf/reactor/daemons.py | 2 +- kopf/reactor/processing.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/timers.rst b/docs/timers.rst index 2db87d1c..c71f2de5 100644 --- a/docs/timers.rst +++ b/docs/timers.rst @@ -137,7 +137,7 @@ When an error happens, its delay overrides the timer's schedule or life cycle: * For `kopf.TemporaryError`, the error's ``delay=...`` option is used. * For `kopf.PermanentError`, the timer stops forever and is never retried. -The timer's own interval is only used of the function exits successfully. +The timer's own interval is only used if the function exits successfully. For example, if the handler fails 3 times with a back-off time set to 5 seconds and the interval set to 10 seconds, it will take 25 seconds (``3*5+10``) diff --git a/kopf/reactor/daemons.py b/kopf/reactor/daemons.py index c8afb3ee..c82a87f9 100644 --- a/kopf/reactor/daemons.py +++ b/kopf/reactor/daemons.py @@ -57,7 +57,7 @@ async def spawn_resource_daemons( be idempotent: not having duplicating side-effects on multiple calls. """ if memory.live_fresh_body is None: # for type-checking; "not None" is ensured in processing. - raise RuntimeError("A daemon is spawned with None as body. This is a bug. Please repor.") + raise RuntimeError("A daemon is spawned with None as body. This is a bug. Please report.") for handler in handlers: daemon_id = containers.DaemonId(handler.id) if daemon_id not in daemons: diff --git a/kopf/reactor/processing.py b/kopf/reactor/processing.py index a3303758..e60784e0 100644 --- a/kopf/reactor/processing.py +++ b/kopf/reactor/processing.py @@ -185,7 +185,7 @@ async def process_resource_event( # Whatever was done, apply the accumulated changes to the object, or sleep-n-touch for delays. # But only once, to reduce the number of API calls and the generated irrelevant events. - # And only of the object is at least supposed to exist (not "GONE"), even if actually does not. + # And only if the object is at least supposed to exist (not "GONE"), even if actually does not. if raw_event['type'] != 'DELETED': await apply_reaction_outcomes( resource=resource, body=body, @@ -279,7 +279,7 @@ async def process_resource_spawning_cause( handlers invocation (even if it takes few cycles). For this, the signal to terminate is sent to the daemons immediately, but the actual check of their shutdown is performed only when all the on-deletion handlers - are succeeded (or after they were invoked if they are optional; + have succeeded (or after they were invoked if they are optional; or immediately if there were no on-deletion handlers to invoke at all). The resource remains blocked by the finalizers until all the daemons exit