def get_service_endpoints(service, namespace, kubeconfig):
    """Return the first address and the named ports of a Service Endpoints.

    Only the first address of the first subset of the `Endpoints` object is
    considered.

    Arguments:
        service (str): Name of the Service (its `Endpoints` object shares
            the same name).
        namespace (str): Namespace in which to look for the object.
        kubeconfig (str): Passed as-is to `metalk8s_kubernetes.get_object`.

    Returns:
        dict: The `hostname`, `ip` and `node_name` keys of the first address
        (only those present on the object), plus a `ports` key mapping each
        port name to its port number.

    Raises:
        CommandExecutionError: If the object cannot be retrieved, does not
            exist, or does not expose any usable subset, address or port.
    """
    error_tpl = \
        'Unable to get kubernetes endpoints for {} in namespace {}:\n{!s}'

    try:
        endpoint = __salt__['metalk8s_kubernetes.get_object'](
            name=service,
            kind='Endpoints',
            apiVersion='v1',
            namespace=namespace,
            kubeconfig=kubeconfig,
        )
    except CommandExecutionError as exc:
        raise CommandExecutionError(
            error_tpl.format(service, namespace, exc)
        )

    if not endpoint:
        raise CommandExecutionError(
            error_tpl.format(service, namespace, 'Endpoint not found')
        )

    try:
        subset = endpoint['subsets'][0]

        # Extract hostname, ip and node_name
        result = {
            key: value
            for key, value in subset['addresses'][0].items()
            if key in ('hostname', 'ip', 'node_name')
        }

        # Add ports info to result dict
        result['ports'] = {
            port['name']: port['port']
            for port in subset['ports']
        }
    # TypeError covers `subsets`, `addresses` or `ports` being None, which
    # is how a Service without any ready backend may be reported.
    except (AttributeError, IndexError, KeyError, TypeError) as exc:
        raise CommandExecutionError(
            error_tpl.format(service, namespace, exc)
        )

    return result
Mon Sep 17 00:00:00 2001 From: Alexandre Allard Date: Mon, 4 May 2020 19:35:00 +0200 Subject: [PATCH 2/4] salt: use the common method to fetch service endpoints this method has been moved to a common place, so that other modules can use it. Refs: #2464 --- salt/_pillar/metalk8s_endpoints.py | 53 +++++------------------------- 1 file changed, 8 insertions(+), 45 deletions(-) diff --git a/salt/_pillar/metalk8s_endpoints.py b/salt/_pillar/metalk8s_endpoints.py index b9ed0bceed..f60b341ba9 100644 --- a/salt/_pillar/metalk8s_endpoints.py +++ b/salt/_pillar/metalk8s_endpoints.py @@ -3,6 +3,7 @@ import logging import os.path +from salt.exceptions import CommandExecutionError log = logging.getLogger(__name__) @@ -16,46 +17,6 @@ def __virtual__(): return __virtualname__ -def service_endpoints(service, namespace, kubeconfig): - try: - endpoint = __salt__['metalk8s_kubernetes.get_object']( - name=service, - kind='Endpoints', - apiVersion='v1', - namespace=namespace, - kubeconfig=kubeconfig, - ) - - if not endpoint: - return __utils__['pillar_utils.errors_to_dict']([ - 'Endpoint not found: {}'.format(service) - ]) - - # Extract hostname, ip and node_name - res = { - k: v - for k, v in endpoint['subsets'][0]['addresses'][0].items() - if k in ['hostname', 'ip', 'node_name'] - } - - # Add ports info to res dict - ports = { - port['name']: port['port'] - for port in endpoint['subsets'][0]['ports'] - } - res['ports'] = ports - except Exception as exc: # pylint: disable=broad-except - error_tplt = ( - 'Unable to get kubernetes endpoints' - ' for {} in namespace {}:\n{}' - ) - return __utils__['pillar_utils.errors_to_dict']([ - error_tplt.format(service, namespace, exc) - ]) - else: - return res - - def ext_pillar(minion_id, pillar, kubeconfig): services = { "kube-system": ['salt-master', 'repositories'], @@ -72,13 +33,15 @@ def ext_pillar(minion_id, pillar, kubeconfig): for namespace, services in services.items(): for service in services: - endpoints.update( - { - service: 
service_endpoints( + try: + service_endpoints = \ + __salt__['metalk8s_kubernetes.get_service_endpoints']( service, namespace, kubeconfig ) - } - ) + except CommandExecutionError as exc: + service_endpoints = \ + __utils__['pillar_utils.errors_to_dict'](str(exc)) + endpoints.update({service: service_endpoints}) __utils__['pillar_utils.promote_errors'](endpoints, service) result = { From c5c5894f02e752ba8e7fe6837dea0b388b70e6ff Mon Sep 17 00:00:00 2001 From: Alexandre Allard Date: Tue, 5 May 2020 11:42:57 +0200 Subject: [PATCH 3/4] salt: add metalk8s utils module add a helper method to automatically retry HTTP requests. Refs: #2464 --- buildchain/buildchain/salt_tree.py | 1 + salt/_utils/metalk8s_utils.py | 58 ++++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+) create mode 100644 salt/_utils/metalk8s_utils.py diff --git a/buildchain/buildchain/salt_tree.py b/buildchain/buildchain/salt_tree.py index b15c727e03..8bc53fa940 100644 --- a/buildchain/buildchain/salt_tree.py +++ b/buildchain/buildchain/salt_tree.py @@ -602,6 +602,7 @@ def _get_parts(self) -> Iterator[str]: Path('salt/_states/metalk8s_package_manager.py'), Path('salt/_states/metalk8s_volumes.py'), + Path('salt/_utils/metalk8s_utils.py'), Path('salt/_utils/kubernetes_utils.py'), Path('salt/_utils/pillar_utils.py'), Path('salt/_utils/volume_utils.py'), diff --git a/salt/_utils/metalk8s_utils.py b/salt/_utils/metalk8s_utils.py new file mode 100644 index 0000000000..18f82993eb --- /dev/null +++ b/salt/_utils/metalk8s_utils.py @@ -0,0 +1,58 @@ +"""Utility methods for MetalK8s modules. 
MISSING_DEPS = []

try:
    import requests
    from requests.adapters import HTTPAdapter
    from requests.packages.urllib3.util.retry import Retry
except ImportError:
    MISSING_DEPS.append('requests')

__virtualname__ = 'metalk8s'


def __virtual__():
    """Expose this module only when all its dependencies were imported.

    Returns:
        The virtual name of the module, or a `(False, reason)` tuple listing
        the missing dependencies.
    """
    if not MISSING_DEPS:
        return __virtualname__

    return False, 'Missing dependencies: {}'.format(', '.join(MISSING_DEPS))


# Source: https://www.peterbe.com/plog/best-practice-with-retries-with-requests
def requests_retry_session(
    retries=3,
    backoff_factor=0.3,
    status_forcelist=(500, 503),
    session=None
):
    """Return a `requests.Session` whose HTTP(S) requests are retried.

    With the default arguments, up to 3 retries are attempted, sleeping for
    an exponentially growing interval between attempts, and the only HTTP
    statuses forcing a retry are internal server errors (500) and service
    unavailable errors (503).

    Arguments:
        retries: The number of retries to perform before giving up (used
            for the total, read and connect retry counters alike)
        backoff_factor: The sleep interval between requests computed as
            {backoff factor} * (2 ^ ({number retries} - 1))
        status_forcelist: HTTP status codes that we should force a retry on
        session: An existing session to configure; a new `requests.Session`
            is created when omitted

    Returns:
        The `requests.Session` object (the one given, if any) with a retry
        adapter mounted for both `http://` and `https://` URLs.
    """
    retry_session = session or requests.Session()

    retry_adapter = HTTPAdapter(
        max_retries=Retry(
            total=retries,
            read=retries,
            connect=retries,
            backoff_factor=backoff_factor,
            status_forcelist=status_forcelist,
        )
    )

    # The same adapter instance serves both schemes.
    for scheme in ("http://", "https://"):
        retry_session.mount(scheme, retry_adapter)

    return retry_session
"""Utility methods to interact with MetalK8s monitoring.
"""

from salt.exceptions import CommandExecutionError

from datetime import datetime, timedelta

MISSING_DEPS = []

try:
    import requests
except ImportError:
    MISSING_DEPS.append('requests')

__virtualname__ = 'metalk8s_monitoring'


def __virtual__():
    """Expose this module only when all its dependencies were imported."""
    if MISSING_DEPS:
        error_msg = 'Missing dependencies: {}'.format(', '.join(MISSING_DEPS))
        return False, error_msg

    return __virtualname__


def add_silence(value, name='alertname', is_regex=False, starts_at=None,
                duration=3600, ends_at=None, time_format='%Y-%m-%dT%H:%M:%S',
                author='', comment='', **kwargs):
    """Add a new silence in Alertmanager.

    Arguments:

        value (str): value to check
        name (str): label to check
        is_regex (bool): Whether `value` should be treated as a regular
            expression or not, defaults to False.
        starts_at (str): Date when the silence starts, defaults to `now`
            (taken in UTC).
        duration (int): Duration of the silence in seconds, defaults to `3600`.
        ends_at (str): Date when the silence ends, defaults to
            `starts_at` + `duration`.
        time_format (str): Time format for `starts_at` and `ends_at` arguments.
            Support the `datetime` Python library flags.
        author (str): Creator of the silence.
        comment (str): A description of why this silence has been put.
        **kwargs: Forwarded to the Alertmanager request helper (e.g.
            `kubeconfig`).

    Dates given through `starts_at` and `ends_at` are sent to Alertmanager
    with a `Z` (UTC) suffix, without any timezone conversion.

    Returns:
        The `silenceId` field of the Alertmanager response.

    CLI Examples:

    .. code-block:: bash

        salt-call metalk8s_monitoring.add_silence KubeMemOvercommit
        salt-call metalk8s_monitoring.add_silence none name=severity
            starts_at="2020-05-05T07:14:52" duration=7200
    """
    # Dates are serialized below with a literal `Z` (UTC) designator
    alertmanager_format = '%Y-%m-%dT%H:%M:%SZ'

    if starts_at is None:
        # Must be UTC (not local time) to match the `Z` suffix, otherwise
        # the silence is shifted by the local UTC offset.
        starts_at = datetime.utcnow()
    else:
        starts_at = datetime.strptime(starts_at, time_format)

    if ends_at is None:
        ends_at = starts_at + timedelta(seconds=duration)
    else:
        ends_at = datetime.strptime(ends_at, time_format)

    body = {
        "matchers": [{
            "name": name,
            "isRegex": is_regex,
            "value": value,
        }],
        "startsAt": starts_at.strftime(alertmanager_format),
        "endsAt": ends_at.strftime(alertmanager_format),
        "createdBy": author,
        "comment": comment,
        "status": {"state": "active"},
    }

    response = _requests_alertmanager_api(
        'api/v1/silences',
        'POST',
        json=body,
        **kwargs
    )

    return response['silenceId']


def delete_silence(silence_id, **kwargs):
    """Delete a silence in Alertmanager

    Arguments:

        silence_id (str): ID of the silence to delete.
        **kwargs: Forwarded to the Alertmanager request helper (e.g.
            `kubeconfig`).

    CLI Examples:

    .. code-block:: bash

        salt-call metalk8s_monitoring.delete_silence \
            64d84a9e-cc6e-41ce-83ff-e84771ff6872
    """
    _requests_alertmanager_api(
        'api/v1/silence/{}'.format(silence_id),
        'DELETE',
        **kwargs
    )


def get_silences(state=None, **kwargs):
    """Get the list of all silences in Alertmanager

    Arguments:

        state (str): Filter silences on their state (e.g. `active`),
            if None, return all silences, defaults to `None`.
        **kwargs: Forwarded to the Alertmanager request helper (e.g.
            `kubeconfig`).

    Returns:
        The `data` field of the Alertmanager response, restricted to the
        silences in the requested state when `state` is given.

    CLI Examples:

    .. code-block:: bash

        salt-call metalk8s_monitoring.get_silences
        salt-call metalk8s_monitoring.get_silences state=active
    """
    response = _requests_alertmanager_api(
        'api/v1/silences',
        'GET',
        **kwargs
    )

    if state is None:
        return response

    # `response` is None when the API answer carries no `data` field
    return [
        silence for silence in (response or [])
        if silence['status']['state'] == state
    ]


def get_alerts(state=None, **kwargs):
    """Get the list of all alerts in Alertmanager

    Arguments:

        state (str): Filter alerts on their state (e.g. `active`),
            if None, return all alerts, defaults to `None`.
        **kwargs: Forwarded to the Alertmanager request helper (e.g.
            `kubeconfig`).

    Returns:
        The `data` field of the Alertmanager response, restricted to the
        alerts in the requested state when `state` is given.

    CLI Examples:

    .. code-block:: bash

        salt-call metalk8s_monitoring.get_alerts
        salt-call metalk8s_monitoring.get_alerts state=suppressed
    """
    response = _requests_alertmanager_api(
        'api/v1/alerts',
        'GET',
        **kwargs
    )

    if state is None:
        return response

    # `response` is None when the API answer carries no `data` field
    return [
        alert for alert in (response or [])
        if alert['status']['state'] == state
    ]


def _requests_alertmanager_api(route, method='GET', **kwargs):
    """Send a request to the Alertmanager API and return its `data` field.

    The API is reached through the first address and the `web` port of the
    `prometheus-operator-alertmanager` Service endpoints, in the
    `metalk8s-monitoring` namespace.

    Arguments:
        route (str): Path of the API route, relative to the server root.
        method (str): HTTP method to use, defaults to `GET`.
        **kwargs: `kubeconfig` is used to look up the Service endpoints,
            remaining arguments are passed to the `request` method of the
            session.

    Returns:
        The `data` field of the JSON response (None if there is none).

    Raises:
        CommandExecutionError: If the request fails, the response is not a
            JSON object, or the API reports an error.
    """
    endpoint = __salt__['metalk8s_kubernetes.get_service_endpoints'](
        'prometheus-operator-alertmanager',
        'metalk8s-monitoring',
        kwargs.pop('kubeconfig', None),
    )

    # Salt packs publication data as `__pub_*` keyword arguments into
    # execution functions accepting `**kwargs`; these are not request
    # options and must not reach the HTTP client.
    request_kwargs = {
        key: value for key, value in kwargs.items()
        if not key.startswith('__pub_')
    }

    url = 'http://{}:{}/{}'.format(
        endpoint['ip'],
        endpoint['ports']['web'],
        route,
    )

    try:
        session = __utils__['metalk8s.requests_retry_session']()
        response = session.request(method, url, **request_kwargs)
    except Exception as exc:  # pylint: disable=broad-except
        raise CommandExecutionError(
            "Unable to query Alertmanager API on {}: {!s}".format(url, exc)
        )

    try:
        payload = response.json()
    except ValueError as exc:
        if response.status_code != requests.codes.ok:
            error = (
                "Received HTTP code {} when querying Alertmanager API on {}"
                .format(response.status_code, url)
            )
        else:
            error = (
                "Malformed response returned from Alertmanager API: {!s}: {}"
                .format(exc, response.text)
            )
        raise CommandExecutionError(error)

    if not isinstance(payload, dict):
        raise CommandExecutionError(
            "Malformed response returned from Alertmanager API: {}"
            .format(response.text)
        )

    if payload.get('status') == 'error':
        raise CommandExecutionError(
            "{}: {}".format(payload.get('errorType'), payload.get('error'))
        )

    return payload.get('data')