static monitor config (#113)
adding static monitor configuration feature
* Updated README.md
* Created static_conf.yml
Luke Reed authored Jul 1, 2020
1 parent 4214ed5 commit 86f7eb8
Showing 12 changed files with 298 additions and 246 deletions.
9 changes: 8 additions & 1 deletion README.md
@@ -77,6 +77,7 @@ rulesets:
* `cluster_variables`: (dict). A collection of variables that can be used in monitors. Reference them in monitors by prefixing with `ClusterVariables`, e.g. `{{ ClusterVariables.var1 }}`.
* `rulesets`: (List). A collection of rulesets. A ruleset consists of a Kubernetes resource type, annotations the resource must have to be considered valid, and a collection of monitors to manage for the resource.
* `type`: (String). The type of resource to match if matching with annotations; it can also be `static` or `binding`. Currently supported values are `deployment`, `namespace`, `binding`, and `static`.
* `match_annotations`: (List). A collection of name/value pairs of annotations that must be present on the resource for it to be managed.
* `bound_objects`: (List). A collection of object types that are bound to this object. For instance, if you have a ruleset for a namespace, you can bind other objects such as deployments and services; when a bound object in the namespace is updated, the namespace's rulesets are applied to it.
* `monitors`: (Map). A collection of monitors to manage for any resource that matches the rules defined, as sketched below.
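
For reference, here is a minimal ruleset sketch tying these fields together, abbreviated from the `ns-pending-pods` namespace monitor in `conf-example.yml` (the annotation name and threshold are illustrative):

```
rulesets:
  - type: namespace
    match_annotations:
      - name: astro/admin
        value: "true"
    monitors:
      ns-pending-pods:
        name: "Pending Pods - {{ .ObjectMeta.Name }}"
        type: query alert
        query: "min(last_30m):sum:kubernetes_state.pod.status_phase{phase:pending,namespace:{{ .ObjectMeta.Name }}}.fill(zero) >= 1"
        message: "At least one pod has been Pending for 30 minutes in {{ .ObjectMeta.Name }}."
        options:
          notify_no_data: false
          thresholds:
            critical: 1
```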
@@ -111,13 +112,19 @@ rulesets:
* `include_tags`: When true, notifications from this monitor automatically insert triggering tags into the title.
* `require_full_window`: boolean indicating if a monitor needs a full window of data to be evaluated.
* `locked`: boolean indicating if changes are only allowed from the creator or admins.
#### Static monitors
A static monitor is one that does not depend on the presence of a resource in the Kubernetes cluster. An example of a
static monitor would be `Host CPU Usage`. There are a variety of example static monitors in the [static_conf.yml example](./static_conf.yml).
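
Below is a minimal sketch of what a static ruleset might look like, assuming it uses the same ruleset structure with `type: static` and no `match_annotations`; the monitor name comes from the example above, but the query, message, and threshold are illustrative rather than copied from `static_conf.yml`:

```
rulesets:
  - type: static
    monitors:
      host-cpu-usage:
        name: "Host CPU Usage"
        type: metric alert
        # Illustrative query: alert when a host averages more than 90% CPU over 10 minutes.
        query: "avg(last_10m):100 - avg:system.cpu.idle{*} by {host} > 90"
        # Static monitors are not templated, so Datadog variables are written
        # directly, without the escaping described in the templating note below.
        message: "CPU usage on {{host.name}} has been above 90% for 10 minutes."
        options:
          notify_no_data: false
          thresholds:
            critical: 90
```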

#### A Note on Templating
Datadog's templating language is very similar to Go templating, so to pass a template variable through to Datadog it must be "escaped" by inserting it as a template literal:

```
{{ "{{/is_alert}}" }}
```

The above does not apply to static monitors; if the extra brackets are present, creation of the static monitor will fail.
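
For example, adapted from the `ns-pending-pods` monitor in `conf-example.yml`, astro's own Go template fields (such as `{{ .ObjectMeta.Name }}`) and escaped Datadog literals can sit side by side in the same message; only the parts destined for Datadog need the escape:

```
message: |-
  {{ "{{#is_alert}}" }}
  At least one pod in namespace {{ .ObjectMeta.Name }} has been Pending for 30 minutes.
  {{ "{{/is_alert}}" }}
```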

## Overriding Configuration

It is possible to override monitor elements using Kubernetes resource annotations.
@@ -135,7 +142,7 @@ As of now, the only fields that can be overridden are:
* query
* type

Additionally, templating in the override is currently not available.
Templating in the override is currently not available.
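
As a purely hypothetical illustration — the exact annotation key format lives in the README section collapsed above and is not shown in this diff, so the key below is an assumption — an override attached to a Kubernetes resource might look roughly like this:

```
# Hypothetical sketch: the annotation key format here is assumed for
# illustration only; consult the full README for the exact key astro expects.
metadata:
  name: my-app
  annotations:
    astro.fairwinds.com/override.ns-pending-pods.query: >-
      min(last_30m):sum:kubernetes_state.pod.status_phase{phase:pending,namespace:my-app}.fill(zero) >= 3
```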

## Contributing
PRs welcome! Check out the [Contributing Guidelines](CONTRIBUTING.md),
4 changes: 2 additions & 2 deletions cmd/root.go
@@ -67,7 +67,7 @@ func init() {
rootCmd.PersistentFlags().StringVarP(&metricsPort, "metrics-port", "p", ":8080", "The address to serve prometheus metrics.")
rootCmd.PersistentFlags().StringVar(&namespace, "namespace", "kube-system", "The namespace where astro is running")
}
func leaderElection(cmd *cobra.Command, args []string) {
func leaderElection(*cobra.Command, []string) {
log.SetOutput(os.Stdout)
log.SetLevel(logLevels[strings.ToLower(logLevel)])

@@ -127,7 +127,7 @@ func leaderElection(cmd *cobra.Command, args []string) {
}

func run(ctx context.Context, cancel context.CancelFunc) {
// create a channel to respond to SIGTERMs
// create a channel to respond to SIGTERM and SIGINT
signals := make(chan os.Signal, 1)
defer close(signals)

169 changes: 4 additions & 165 deletions conf-example.yml
@@ -64,51 +64,10 @@ rulesets:
- name: astro/admin-bound
value: "true"
monitors:
ns-high-load-avg:
name: "High System Load Average"
type: metric alert
query: "avg(last_30m):avg:system.load.norm.5{k8s.io/role/master:1} by {host} > 2"
message: |-
Load average is high on {{ "{{host.name}} {{host.ip}}" }}.
This is a normalized load based on the number of CPUs (i.e. ActualLoadAverage / NumberOfCPUs)
Is this node over-provisioned? Pods may need to have CPU limits closer to their requests
Is this node doing a lot of I/O? Load average could be high based on high disk or networking I/O. This may be acceptable if application performance is still ok. To reduce I/O-based system load, you may need to artificially limit the number of high-I/O pods running on a single node.
tags: []
options:
notify_audit: false
notify_no_data: false
new_host_delay: 300
thresholds:
critical: 2
locked: false
ns-high-mem-use:
name: "Memory Utilization"
type: query alert
query: "avg(last_15m):avg:system.mem.pct_usable{k8s.io/role/master:1} by {host} < 0.1"
message: |-
{{ "{{#is_alert}}" }}
Running out of free memory on {{ "{{host.name}}" }}
{{ "{{/is_alert}}" }}
{{ "{{#is_alert_to_warning}}" }}
Memory usage has decreased. There is about 30% free
{{ "{{/is_alert_to_warning}}" }}
{{ "{{#is_alert_recovery}}" }}
Memory is below threshold again
{{ "{{/is_alert_recovery}}" }}
tags: []
options:
notify_audit: false
notify_no_data: false
new_host_delay: 300
require_full_window: true
thresholds:
critical: 0.1
warning: 0.15
locked: false
ns-pending-pods:
name: "Pending Pods"
type: metric alert
query: "min(last_30m):sum:kubernetes_state.pod.status_phase{phase:running} - sum:kubernetes_state.pod.status_phase{phase:running} + sum:kubernetes_state.pod.status_phase{phase:pending}.fill(zero) >= 1"
name: "Pending Pods - {{ .ObjectMeta.Name }}"
type: query alert
query: "min(last_30m):sum:kubernetes_state.pod.status_phase{phase:running,namespace:{{ .ObjectMeta.Name }}} - sum:kubernetes_state.pod.status_phase{phase:running,namespace:{{ .ObjectMeta.Name }}} + sum:kubernetes_state.pod.status_phase{phase:pending,namespace:{{ .ObjectMeta.Name }}}.fill(zero) >= 1"
message: |-
{{ "{{#is_alert}}" }}
There has been at least 1 pod Pending for 30 minutes.
@@ -126,128 +85,8 @@ rulesets:
notify_no_data: false
new_host_delay: 300
thresholds:
critical: 1
locked: false
ns-host-disk-use:
name: "Host Disk Usage"
type: metric alert
query: "avg(last_30m):(avg:system.disk.total{*} by {host} - avg:system.disk.free{*} by {host}) / avg:system.disk.total{*} by {host} * 100 > 90"
message: |-
{{ "{{#is_alert}}" }}
Disk Usage has been above threshold over 30 minutes on {{ "{{host.name}}" }}
{{ "{{/is_alert}}" }}
{{ "{{#is_warning}}" }}
Disk Usage has been above threshold over 30 minutes on {{ "{{host.name}}" }}
{{ "{{/is_warning}}" }}
{{ "{{^is_alert}}" }}
Disk Usage has recovered on {{ "{{host.name}}" }}
{{ "{{/is_alert}}" }}
{{ "{{^is_warning}}" }}
Disk Usage has recovered on {{ "{{host.name}}" }}
{{ "{{/is_warning}}" }}
tags: []
options:
notify_audit: false
notify_no_data: false
new_host_delay: 300
require_full_window: true
thresholds:
critical: 90
warning: 80
warning_recovery: 75
critical_recovery: 85
locked: false
ns-hpa-errors:
name: "HPA Errors"
type: event alert
query: "events('sources:kubernetes priority:all \"unable to fetch metrics from resource metrics API:\"').by('hpa').rollup('count').last('1h') > 200"
message: |-
{{ "{{#is_alert}}" }}
A high number of hpa failures (> {{ "{{threshold}}" }} ) are occurring. Can HPAs get metrics?
{{ "{{/is_alert}}" }}
{{ "{{#is_alert_recovery}}" }}
HPA Metric Retrieval Failure has recovered.
{{ "{{/is_alert_recovery}}" }}
tags: []
options:
notify_audit: false
notify_no_data: false
require_full_window: true
locked: false
ns-io-wait-times:
name: "I/O Wait Times"
type: metric alert
query: "avg(last_10m):avg:system.cpu.iowait{*} by {host} > 50"
message: |-
{{ "{{#is_alert}}" }}
The I/O wait time for {host.ip} is very high
- Is the EBS volume out of burst capacity for iops?
- Is something writing lots of errors to the journal?
- Is there a pod doing something unexpected (crash looping, etc)?
{{ "{{/is_alert}}" }}
{{ "{{^is_alert}}" }}
The EBS volume burst capacity is returning to normal.
{{ "{{/is_alert}}" }}
tags: []
options:
notify_audit: false
new_host_delay: 300
notify_no_data: false
require_full_window: true
locked: false
thresholds:
critical: 50
warning: 30
ns-nginx-config-reload-fail:
name: "Nginx Config Reload Failure"
type: metric alert
query: "max(last_5m):max:ingress.nginx_ingress_controller_config_last_reload_successful{*} by {kube_deployment} <= 0"
message: |-
{{ "{{#is_alert}}" }}
The last nginx config reload for {{ "{{kube_deployment.name}}" }} failed! Are there any bad ingress configs? Does the nginx config have a syntax error?
{{ "{{/is_alert}}" }}
{{ "{{#is_recovery}}" }}
Nginx config reloaded successfully!
{{ "{{/is_recovery}}" }}
tags: []
options:
notify_audit: false
new_host_delay: 300
notify_no_data: false
require_full_window: true
locked: false
thresholds:
critical: 0
critical_recovery: 1
ns-node-not-ready:
name: "Node is not Ready"
type: service check
query: |
"kubernetes_state.node.ready".by("host").last(20).count_by_status()
message: |-
{{ "{{#is_alert}}" }}
A Node is not ready!
Cluster: {{ "{{kubernetescluster.name}}" }}
Host: {{ "{{host.name}}" }}
IP: {{ "{{host.ip}}" }}
{{ "{{check_message}}" }}
{{ "{{/is_alert}}" }}
{{ "{{#is_recovery}}" }}
Node is now ready.
Cluster: {{ "{{kubernetescluster.name}}" }}
Host: {{ "{{host.name}}" }}
IP: {{ "{{host.ip}}" }}
{{ "{{/is_recovery}}" }}
tags: []
options:
notify_audit: false
no_data_timeframe: 2
new_host_delay: 900
notify_no_data: false
critical: 1.0
locked: false
thresholds:
critical: 20
ok: 2
- type: namespace
match_annotations:
- name: astro/admin