Skip to content
This repository has been archived by the owner on Jun 6, 2024. It is now read-only.

[Alert manager] k8s cert expiration checker #5409

Merged
merged 38 commits into from
Apr 9, 2021
Merged
Show file tree
Hide file tree
Changes from 19 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions src/alert-manager/build/cert-expiration-checker.common.dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

# Base image that provides the Python 3.7 interpreter and pip3.
FROM python:3.7

# Copy the checker sources (send_alert.py, requirements.txt, ...) into the
# image's working directory; the relative paths below resolve against it.
COPY ./src/cert-expiration-checker .

# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip3 install --no-cache-dir -r requirements.txt

# Each container start runs the checker script once.
ENTRYPOINT ["python3", "send_alert.py"]
3 changes: 3 additions & 0 deletions src/alert-manager/config/alert-manager.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,3 +31,6 @@ cluster-utilization:
configured: False
use-pylon: False
repeat-interval: '24h'
cert-expiration-checker:
schedule: '0 0 * * *'
alert-residual-days: 30
suiguoxin marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

# CronJob that runs the cert-expiration-checker image on the configured
# schedule, on a node labelled pai-master.
apiVersion: batch/v1beta1
kind: CronJob
metadata:
  name: cert-expiration-checker
spec:
  schedule: "{{ cluster_cfg["alert-manager"]["cert-expiration-checker"]["schedule"] }}"
  jobTemplate:
    spec:
      template:
        spec:
          containers:
          - name: cert-expiration-checker
            image: {{ cluster_cfg['cluster']['docker-registry']['prefix'] }}cert-expiration-checker:{{ cluster_cfg['cluster']['docker-registry']['tag'] }}
            imagePullPolicy: Always
            env:
            # PAI_URI is the pylon HTTPS URI when pylon SSL is configured,
            # otherwise the plain pylon URI.
            - name: PAI_URI
            {%- if "ssl" in cluster_cfg["pylon"] and cluster_cfg["pylon"]["ssl"] %}
              value: "{{ cluster_cfg['pylon']['uri-https']}}"
            {%- else %}
              value: "{{ cluster_cfg['pylon']['uri']}}"
            {%- endif %}
            - name: ALERT_RESIDUAL_DAYS
              value: "{{ cluster_cfg["alert-manager"]["cert-expiration-checker"]["alert-residual-days"] }}"
            # The checker only reads the host's kubeadm binary and Kubernetes
            # configuration, so both host paths are mounted read-only.
            volumeMounts:
            - mountPath: /usr/local/bin/kubeadm
              name: kubeadm
              readOnly: true
            - mountPath: /etc/kubernetes/
              name: kubernetes-config
              readOnly: true
          volumes:
          - name: kubeadm
            hostPath:
              path: /usr/local/bin/kubeadm
          - name: kubernetes-config
            hostPath:
              path: /etc/kubernetes/
          imagePullSecrets:
          - name: {{ cluster_cfg["cluster"]["docker-registry"]["secret-name"] }}
          restartPolicy: OnFailure
          nodeSelector:
            pai-master: "true"
3 changes: 2 additions & 1 deletion src/alert-manager/deploy/service.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ template-list:
- alert-manager-deployment.yaml
- alert-manager-configmap.yaml
- alert-manager-cronjob.yaml
- alert-manager-cert-expiration-check-cronjob.yaml
- start.sh

start-script: start.sh
Expand All @@ -37,4 +38,4 @@ upgraded-script: upgraded.sh


deploy-rules:
- in: pai-master
- in: pai-master
3 changes: 2 additions & 1 deletion src/alert-manager/deploy/start.sh.template
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@

pushd $(dirname "$0") > /dev/null

# crate configmap for alert-templates
# create configmap for alert-templates
{% if cluster_cfg["alert-manager"]["alert-handler"]["configured"] -%}
{% if 'email-admin' in cluster_cfg["alert-manager"]["actions-available"] -%}
kubectl create configmap alert-templates \
Expand All @@ -34,6 +34,7 @@ kubectl create configmap alert-templates \
kubectl apply --overwrite=true -f rbac.yaml || exit $?
kubectl apply --overwrite=true -f alert-manager-configmap.yaml || exit $?
kubectl apply --overwrite=true -f alert-manager-deployment.yaml || exit $?
kubectl apply --overwrite=true -f alert-manager-cert-expiration-check-cronjob.yaml || exit $?
{% if cluster_cfg["alert-manager"]["cluster-utilization"]["configured"] -%}
kubectl apply --overwrite=true -f alert-manager-cronjob.yaml || exit $?
{% endif -%}
Expand Down
1 change: 1 addition & 0 deletions src/alert-manager/deploy/stop.sh
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ kubectl delete --ignore-not-found --now configmap/alert-templates
kubectl delete --ignore-not-found --now configmap/alertmanager
kubectl delete --ignore-not-found --now deployment/alertmanager
kubectl delete --ignore-not-found --now cronjob/cluster-utilization
kubectl delete --ignore-not-found --now cronjob/cert-expiration-checker

if kubectl get clusterrolebinding | grep -q "alert-manager-role-binding"; then
kubectl delete clusterrolebinding alert-manager-role-binding || exit $?
Expand Down
Empty file.
10 changes: 10 additions & 0 deletions src/alert-manager/src/cert-expiration-checker/pylintrc
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
[SETTINGS]

max-line-length=140

disable =
missing-docstring,
invalid-name,
cell-var-from-loop,
undefined-loop-variable,
too-many-locals,
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
requests==2.23.0
60 changes: 60 additions & 0 deletions src/alert-manager/src/cert-expiration-checker/send_alert.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
import functools
import logging
import os
from datetime import timezone, datetime, timedelta

import requests

# Path of the alert-manager alerts endpoint; appended to the cluster URI.
ALERT_PREFIX = "/alert-manager/api/v1/alerts"
# Certificates with fewer residual days than this trigger an alert. Falls back
# to 30 when ALERT_RESIDUAL_DAYS is unset, instead of failing at import time
# with a TypeError from int(None).
alertResidualDays = int(os.environ.get('ALERT_RESIDUAL_DAYS', '30'))

def enable_request_debug_log(func):
    """Decorate *func* so urllib3 debug logging is enabled while it runs.

    For the duration of each call the "urllib3" logger is set to DEBUG with
    propagation turned on. The logger's previous level and propagate flag are
    restored afterwards, including when *func* raises.

    Args:
        func: The callable to wrap.

    Returns:
        A wrapper with the same signature and metadata as *func*.
    """
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        requests_log = logging.getLogger("urllib3")
        # Remember the current state so the call leaves the logger as it was.
        previous_level = requests_log.level
        previous_propagate = requests_log.propagate
        requests_log.setLevel(logging.DEBUG)
        requests_log.propagate = True

        try:
            return func(*args, **kwargs)
        finally:
            requests_log.setLevel(previous_level)
            requests_log.propagate = previous_propagate

    return wrapper

@enable_request_debug_log
def send_alert(pai_url: str, certExpirationInfo: str, timeout: float = 30):
    """Post a "k8s cert expiration" alert to alert-manager.

    Args:
        pai_url: Base URI of the cluster; ALERT_PREFIX is appended to it
            after stripping any trailing slash.
        certExpirationInfo: Certificate expiration text, attached to the
            alert unchanged.
        timeout: Seconds to wait for alert-manager before giving up.

    Raises:
        requests.HTTPError: If alert-manager responds with an error status.
        requests.Timeout: If alert-manager does not answer within *timeout*.
    """
    trigger_time = str(datetime.now(timezone.utc).date())
    post_url = pai_url.rstrip("/") + ALERT_PREFIX
    alerts = [{
        "labels": {
            "alertname": "k8s cert expiration",
            "severity": "warn",
            "trigger_time": trigger_time,
            # NOTE(review): the text is sent as a *label* named "annotations",
            # not as the alert's top-level annotations map. Kept unchanged
            # because consumers of this payload are not visible here.
            "annotations": certExpirationInfo,
        },
        "generatorURL": "alert/script",
    }]

    logging.info("Sending alerts to alert-manager...")
    # Without a timeout, requests waits indefinitely on an unresponsive
    # server, which would leave the job hanging.
    resp = requests.post(post_url, json=alerts, timeout=timeout)
    resp.raise_for_status()
    logging.info("Alerts sent to alert-manager.")

def _residual_tokens(report):
    """Return the residual-time field of every certificate row in *report*.

    NOTE(review): assumes each row's expiry timestamp ends with a "UTC" field
    directly followed by the residual time (e.g. "... 23:36 UTC 364d no"),
    which is the layout the former positional parsing relied on -- confirm
    against the kubeadm version in use.
    """
    tokens = []
    for line in report.splitlines():
        fields = line.split()
        # Headers and other lines have no value following a "UTC" field.
        if "UTC" in fields[:-1]:
            tokens.append(fields[fields.index("UTC") + 1])
    return tokens


def _residual_days(residual):
    """Convert a residual-time token such as "364d", "2y" or "5h" to days.

    Tokens that are not "<number><unit>" (presumably e.g. "<invalid>" for an
    already expired certificate -- TODO confirm) count as zero days left, so
    they raise an alert instead of crashing the check.
    """
    days_per_unit = {"y": 365, "d": 1, "h": 1 / 24, "m": 1 / 1440, "s": 1 / 86400}
    value, unit = residual[:-1], residual[-1:]
    if not value.isdigit() or unit not in days_per_unit:
        return 0
    return int(value) * days_per_unit[unit]


def main():
    """Check certificate expiration with kubeadm and alert when it is near.

    Runs ``kubeadm alpha certs check-expiration`` and sends a single alert,
    carrying the full kubeadm output, if any certificate has fewer than
    ``alertResidualDays`` days left. The target URI is read from PAI_URI.
    """
    PAI_URI = os.environ.get("PAI_URI")
    certExpirationInfo = os.popen('kubeadm alpha certs check-expiration --config="/etc/kubernetes/kubeadm-config.yaml"').read()
    residualTimes = _residual_tokens(certExpirationInfo)
    if not residualTimes:
        # Empty or unrecognised output means nothing was actually checked.
        logging.error("No certificate expiration data found in kubeadm output.")
        return
    # One alert per run is enough: it already contains every certificate.
    if any(_residual_days(residualTime) < alertResidualDays for residualTime in residualTimes):
        send_alert(PAI_URI, certExpirationInfo)

if __name__ == "__main__":
    # Set up root logging (timestamp, level, source location, message) before
    # running the check.
    log_format = "%(asctime)s - %(levelname)s - %(filename)s:%(lineno)s - %(message)s"
    logging.basicConfig(level=logging.INFO, format=log_format)
    main()