Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Alert silences management #2511

Merged
merged 4 commits into from
May 6, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions buildchain/buildchain/salt_tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -571,6 +571,7 @@ def _get_parts(self) -> Iterator[str]:
Path('salt/_modules/metalk8s_grafana.py'),
Path('salt/_modules/metalk8s_kubernetes.py'),
Path('salt/_modules/metalk8s_kubernetes_utils.py'),
Path('salt/_modules/metalk8s_monitoring.py'),
Path('salt/_modules/metalk8s_network.py'),
Path('salt/_modules/metalk8s_package_manager_yum.py'),
Path('salt/_modules/metalk8s_package_manager_apt.py'),
Expand Down Expand Up @@ -602,6 +603,7 @@ def _get_parts(self) -> Iterator[str]:
Path('salt/_states/metalk8s_package_manager.py'),
Path('salt/_states/metalk8s_volumes.py'),

Path('salt/_utils/metalk8s_utils.py'),
Path('salt/_utils/kubernetes_utils.py'),
Path('salt/_utils/pillar_utils.py'),
Path('salt/_utils/volume_utils.py'),
Expand Down
43 changes: 43 additions & 0 deletions salt/_modules/metalk8s_kubernetes_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,3 +160,46 @@ def read_and_render_yaml_file(source, template, context=None, saltenv='base'):
template))

return salt.utils.yaml.safe_load(contents)


def get_service_endpoints(service, namespace, kubeconfig):
    """Get the first address and the ports of a Kubernetes `Endpoints` object.

    Arguments:

        service (str): name of the `Endpoints` object to retrieve
        namespace (str): namespace in which the object is looked up
        kubeconfig (str): forwarded as-is to `metalk8s_kubernetes.get_object`

    Returns:

        dict: whichever of `hostname`, `ip` and `node_name` are present on the
        first address of the first subset, plus a `ports` key mapping each
        port name to its port number.

    Raises:

        CommandExecutionError: if the object cannot be retrieved, is not
        found, or does not have the expected structure.
    """
    error_tpl = \
        'Unable to get kubernetes endpoints for {} in namespace {}:\n{!s}'

    try:
        endpoint = __salt__['metalk8s_kubernetes.get_object'](
            name=service,
            kind='Endpoints',
            apiVersion='v1',
            namespace=namespace,
            kubeconfig=kubeconfig,
        )
    except CommandExecutionError as exc:
        raise CommandExecutionError(
            error_tpl.format(service, namespace, exc)
        )

    if not endpoint:
        raise CommandExecutionError(
            error_tpl.format(service, namespace, 'Endpoint not found')
        )

    try:
        first_subset = endpoint['subsets'][0]

        # Extract hostname, ip and node_name
        result = {
            k: v
            for k, v in first_subset['addresses'][0].items()
            if k in ['hostname', 'ip', 'node_name']
        }

        # Add ports info to result dict
        result['ports'] = {
            port['name']: port['port']
            for port in first_subset['ports']
        }
    # TypeError covers a `None` where a list or dict is expected (e.g. if
    # `subsets`, `addresses` or `ports` is None): subscripting or iterating
    # None raises TypeError, which must also be reported as a
    # CommandExecutionError rather than leaking out as a raw traceback.
    except (AttributeError, IndexError, KeyError, TypeError) as exc:
        raise CommandExecutionError(
            error_tpl.format(service, namespace, exc)
        )

    return result
213 changes: 213 additions & 0 deletions salt/_modules/metalk8s_monitoring.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,213 @@
"""Utility methods to interact with MetalK8s monitoring.
"""

from salt.exceptions import CommandExecutionError

from datetime import datetime, timedelta

MISSING_DEPS = []

try:
import requests
except ImportError:
MISSING_DEPS.append('requests')

__virtualname__ = 'metalk8s_monitoring'


def __virtual__():
    """Tell the Salt loader whether this module can be loaded.

    Returns the module's virtual name when no dependency was recorded as
    missing, otherwise a `(False, reason)` tuple listing what is missing.
    """
    if not MISSING_DEPS:
        return __virtualname__

    return False, 'Missing dependencies: {}'.format(', '.join(MISSING_DEPS))


def add_silence(value, name='alertname', is_regex=False, starts_at=None,
                duration=3600, ends_at=None, time_format='%Y-%m-%dT%H:%M:%S',
                author='', comment='', **kwargs):
    """Add a new silence in Alertmanager.

    Arguments:

        value (str): value to check
        name (str): label to check
        is_regex (bool): Whether `value` should be treated as a regular
            expression or not, defaults to False.
        starts_at (str): Date when the silence starts, defaults to `now`.
        duration (int): Duration of the silence in seconds, defaults to `3600`.
        ends_at (str): Date when the silence ends, defaults to
            `starts_at` + `duration`.
        time_format (str): Time format for `starts_at` and `ends_at` arguments.
            Support the `datetime` Python library flags.
        author (str): Creator of the silence.
        comment (str): A description of why this silence has been put.

    Dates are sent to Alertmanager with a `Z` (UTC) suffix and without any
    timezone conversion, so `starts_at` and `ends_at` must be given in UTC.

    Any extra keyword argument is forwarded to `_requests_alertmanager_api`.

    Returns:

        The `silenceId` field of the Alertmanager response.

    CLI Examples:

    .. code-block:: bash

        salt-call metalk8s_monitoring.add_silence KubeMemOvercommit
        salt-call metalk8s_monitoring.add_silence none name=severity
            starts_at="2020-05-05T07:14:52" duration=7200
    """
    if starts_at is None:
        # The timestamps below are serialized with a literal `Z` suffix, so
        # the default start has to come from the UTC clock: using local time
        # would shift the silence by the host's UTC offset.
        starts_at = datetime.utcnow()
    else:
        starts_at = datetime.strptime(starts_at, time_format)

    if ends_at is None:
        ends_at = starts_at + timedelta(seconds=duration)
    else:
        ends_at = datetime.strptime(ends_at, time_format)

    body = {
        "matchers": [{
            "name": name,
            "isRegex": is_regex,
            "value": value,
        }],
        "startsAt": starts_at.strftime('%Y-%m-%dT%H:%M:%SZ'),
        "endsAt": ends_at.strftime('%Y-%m-%dT%H:%M:%SZ'),
        "createdBy": author,
        "comment": comment,
        "status": {"state": "active"},
    }

    response = _requests_alertmanager_api(
        'api/v1/silences',
        'POST',
        json=body,
        **kwargs
    )

    return response['silenceId']


def delete_silence(silence_id, **kwargs):
    """Remove an existing silence from Alertmanager.

    Arguments:

        silence_id (str): ID of the silence to delete.

    Any extra keyword argument is forwarded to `_requests_alertmanager_api`.

    CLI Examples:

    .. code-block:: bash

        salt-call metalk8s_monitoring.delete_silence \
            64d84a9e-cc6e-41ce-83ff-e84771ff6872
    """
    route = 'api/v1/silence/{}'.format(silence_id)
    _requests_alertmanager_api(route, 'DELETE', **kwargs)


def get_silences(state=None, **kwargs):
    """List the silences known to Alertmanager, optionally filtered by state.

    Arguments:

        state (str): Only keep silences whose `status.state` equals this
            value (e.g. `active`); when `None` (the default), every silence
            is returned.

    Any extra keyword argument is forwarded to `_requests_alertmanager_api`.

    CLI Examples:

    .. code-block:: bash

        salt-call metalk8s_monitoring.get_silences
        salt-call metalk8s_monitoring.get_silences state=active
    """
    all_silences = _requests_alertmanager_api(
        'api/v1/silences', 'GET', **kwargs
    )

    # No filter requested: hand back the API payload untouched.
    if state is None:
        return all_silences

    return [
        entry for entry in all_silences
        if entry['status']['state'] == state
    ]


def get_alerts(state=None, **kwargs):
    """List the alerts known to Alertmanager, optionally filtered by state.

    Arguments:

        state (str): Only keep alerts whose `status.state` equals this value
            (e.g. `active`); when `None` (the default), every alert is
            returned.

    Any extra keyword argument is forwarded to `_requests_alertmanager_api`.

    CLI Examples:

    .. code-block:: bash

        salt-call metalk8s_monitoring.get_alerts
        salt-call metalk8s_monitoring.get_alerts state=suppressed
    """
    all_alerts = _requests_alertmanager_api(
        'api/v1/alerts', 'GET', **kwargs
    )

    # No filter requested: hand back the API payload untouched.
    if state is None:
        return all_alerts

    return [
        entry for entry in all_alerts
        if entry['status']['state'] == state
    ]


def _requests_alertmanager_api(route, method='GET', **kwargs):
    """Query the Alertmanager HTTP API and return the `data` of its answer.

    The Alertmanager address is resolved through
    `metalk8s_kubernetes.get_service_endpoints` for the
    `prometheus-operator-alertmanager` service in the `metalk8s-monitoring`
    namespace, using its `web` port.

    Arguments:

        route (str): path of the API route to call, without leading slash
            (e.g. `api/v1/silences`).
        method (str): HTTP method to use, defaults to `GET`.
        kubeconfig (str): optional, only used to resolve the endpoint; it is
            removed from `kwargs` before the HTTP request is sent.
        **kwargs: passed as-is to the `request` method of the session.

    Returns:

        The content of the `data` key of the JSON response (None if absent).

    Raises:

        CommandExecutionError: if the endpoint lacks the needed information,
        the request fails, the response is not the expected JSON document,
        or the API reports an error.
    """
    endpoint = __salt__['metalk8s_kubernetes.get_service_endpoints'](
        'prometheus-operator-alertmanager',
        'metalk8s-monitoring',
        kwargs.pop('kubeconfig', None),
    )

    try:
        url = 'http://{}:{}/{}'.format(
            endpoint['ip'],
            endpoint['ports']['web'],
            route,
        )
    except (KeyError, TypeError) as exc:
        # The endpoint dict only carries the keys found on the Endpoints
        # object, so `ip` or the `web` port may be missing.
        raise CommandExecutionError(
            "Unable to build Alertmanager API URL, missing endpoint "
            "information: {!s}".format(exc)
        )

    try:
        session = __utils__['metalk8s.requests_retry_session']()
        response = session.request(method, url, **kwargs)
    except Exception as exc:  # pylint: disable=broad-except
        raise CommandExecutionError(
            "Unable to query Alertmanager API on {}: {!s}".format(url, exc)
        )

    try:
        payload = response.json()
    except ValueError as exc:
        if response.status_code != requests.codes.ok:
            error = (
                "Received HTTP code {} when querying Alertmanager API on {}"
                .format(response.status_code, url)
            )
        else:
            error = (
                "Malformed response returned from Alertmanager API: {!s}: {}"
                .format(exc, response.text)
            )
        raise CommandExecutionError(error)

    # Valid JSON is not necessarily the expected envelope: reject anything
    # that is not an object carrying a `status` instead of failing later
    # with a bare KeyError/TypeError.
    if not isinstance(payload, dict) or 'status' not in payload:
        raise CommandExecutionError(
            "Malformed response returned from Alertmanager API: {}"
            .format(response.text)
        )

    if payload['status'] == 'error':
        raise CommandExecutionError(
            "{}: {}".format(payload.get('errorType'), payload.get('error'))
        )

    return payload.get('data')
53 changes: 8 additions & 45 deletions salt/_pillar/metalk8s_endpoints.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import logging
import os.path

from salt.exceptions import CommandExecutionError

log = logging.getLogger(__name__)

Expand All @@ -16,46 +17,6 @@ def __virtual__():
return __virtualname__


def service_endpoints(service, namespace, kubeconfig):
    """Describe the first address and the ports of an `Endpoints` object.

    Looks the object up through `metalk8s_kubernetes.get_object` and returns
    a dict with whichever of `hostname`, `ip` and `node_name` exist on the
    first address of the first subset, plus a `ports` mapping of port name to
    port number.

    This function never raises on lookup or parsing problems: any failure is
    returned as the result of `pillar_utils.errors_to_dict` instead.
    """
    try:
        obj = __salt__['metalk8s_kubernetes.get_object'](
            name=service,
            kind='Endpoints',
            apiVersion='v1',
            namespace=namespace,
            kubeconfig=kubeconfig,
        )

        if not obj:
            not_found = 'Endpoint not found: {}'.format(service)
            return __utils__['pillar_utils.errors_to_dict']([not_found])

        subset = obj['subsets'][0]
        address = subset['addresses'][0]

        # Keep only the identifying fields of the address
        wanted = ('hostname', 'ip', 'node_name')
        info = {key: val for key, val in address.items() if key in wanted}

        # Attach the named ports
        info['ports'] = {
            entry['name']: entry['port'] for entry in subset['ports']
        }
        return info
    except Exception as exc:  # pylint: disable=broad-except
        message = (
            'Unable to get kubernetes endpoints'
            ' for {} in namespace {}:\n{}'
        ).format(service, namespace, exc)
        return __utils__['pillar_utils.errors_to_dict']([message])


def ext_pillar(minion_id, pillar, kubeconfig):
services = {
"kube-system": ['salt-master', 'repositories'],
Expand All @@ -72,13 +33,15 @@ def ext_pillar(minion_id, pillar, kubeconfig):

for namespace, services in services.items():
for service in services:
endpoints.update(
{
service: service_endpoints(
try:
service_endpoints = \
__salt__['metalk8s_kubernetes.get_service_endpoints'](
service, namespace, kubeconfig
)
}
)
except CommandExecutionError as exc:
service_endpoints = \
__utils__['pillar_utils.errors_to_dict'](str(exc))
endpoints.update({service: service_endpoints})
__utils__['pillar_utils.promote_errors'](endpoints, service)

result = {
Expand Down
Loading