Skip to content

Commit

Permalink
Add Prometheus role
Browse files Browse the repository at this point in the history
  • Loading branch information
horia-delicoti committed Apr 12, 2023
1 parent 566526e commit 67dad54
Show file tree
Hide file tree
Showing 7 changed files with 391 additions and 0 deletions.
6 changes: 6 additions & 0 deletions prometheus.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
---
# Playbook entry point: apply the prometheus role to every host in inventory.
- name: Prometheus
  hosts: all
  become: true

  roles:
    - role: prometheus
200 changes: 200 additions & 0 deletions roles/prometheus/defaults/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,200 @@
---
# Defaults for the prometheus role; override in inventory/group_vars as needed.

# Deployment mode: 'install' or 'removal' (dispatched in tasks/main.yml).
prometheus_deployment: install
# Quoted so YAML always keeps the version as a string, never a number.
prometheus_version: "2.43.0"
# NOTE(review): source had the scrape-mangled host "github.com"; restored to github.com.
prometheus_url: "https://github.com/prometheus/prometheus/releases/download/v{{ prometheus_version }}/prometheus-{{ prometheus_version }}.linux-amd64.tar.gz"
prometheus_port: 9090

prometheus_binary_install_dir: '/usr/local/bin'
prometheus_config_dir: /etc/prometheus
prometheus_data_dir: /var/lib/prometheus

prometheus_storage_retention: "30d"

# Prometheus Configuration vars
# #############################

prometheus_global:
  scrape_interval: 15s
  scrape_timeout: 10s
  evaluation_interval: 15s

prometheus_external_labels:
  environment: "{{ ansible_fqdn | default(ansible_host) | default(inventory_hostname) }}"

prometheus_alertmanager_config:
  - static_configs:
      - targets:
          - localhost:9093

prometheus_alert_relabel_configs: []
# Example:
# prometheus_alert_relabel_configs:
#   - action: labeldrop
#     regex: replica

prometheus_scrape_configs:
  - job_name: "prometheus"
    metrics_path: "/metrics"
    static_configs:
      - targets:
          - localhost:9090
  - job_name: "node_exporter"
    file_sd_configs:
      - files:
          - "{{ prometheus_config_dir }}/file_sd/node_exporter.yml"

prometheus_targets:
  node_exporter:
    - targets:
        - localhost:9100

# Alerting rules rendered under {{ prometheus_config_dir }}/rules.
# PromQL expressions are double-quoted strings with \n escapes so promtool
# receives real newlines; Go-template references ({{ $labels.* }}, $value)
# are wrapped in {% raw %}...{% endraw %} so Jinja leaves them untouched.
prometheus_alert_rules:
  - alert: Watchdog
    expr: vector(1)
    for: 10m
    labels:
      severity: warning
    annotations:
      description: "This is an alert meant to ensure that the entire alerting pipeline is functional.\nThis alert is always firing, therefore it should always be firing in Alertmanager\nand always fire against a receiver. There are integrations with various notification\nmechanisms that send a notification when this alert is not firing. For example the\n\"DeadMansSnitch\" integration in PagerDuty."
      summary: 'Ensure entire alerting pipeline is functional'
  - alert: InstanceDown
    expr: 'up == 0'
    for: 5m
    labels:
      severity: critical
    annotations:
      description: '{% raw %}{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes.{% endraw %}'
      summary: '{% raw %}Instance {{ $labels.instance }} down{% endraw %}'
  - alert: RebootRequired
    expr: 'node_reboot_required > 0'
    labels:
      severity: warning
    annotations:
      description: '{% raw %}{{ $labels.instance }} requires a reboot.{% endraw %}'
      summary: '{% raw %}Instance {{ $labels.instance }} - reboot required{% endraw %}'
  - alert: NodeFilesystemSpaceFillingUp
    annotations:
      description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up.{% endraw %}'
      summary: 'Filesystem is predicted to run out of space within the next 24 hours.'
    expr: "(\n  node_filesystem_avail_bytes{job=\"node\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node\",fstype!=\"\"} * 100 < 40\nand\n  predict_linear(node_filesystem_avail_bytes{job=\"node\",fstype!=\"\"}[6h], 24*60*60) < 0\nand\n  node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n"
    for: 1h
    labels:
      severity: warning
  - alert: NodeFilesystemSpaceFillingUp
    annotations:
      description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up fast.{% endraw %}'
      summary: 'Filesystem is predicted to run out of space within the next 4 hours.'
    expr: "(\n  node_filesystem_avail_bytes{job=\"node\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node\",fstype!=\"\"} * 100 < 20\nand\n  predict_linear(node_filesystem_avail_bytes{job=\"node\",fstype!=\"\"}[6h], 4*60*60) < 0\nand\n  node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n"
    for: 1h
    labels:
      severity: critical
  - alert: NodeFilesystemAlmostOutOfSpace
    annotations:
      description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.{% endraw %}'
      summary: 'Filesystem has less than 5% space left.'
    expr: "(\n  node_filesystem_avail_bytes{job=\"node\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node\",fstype!=\"\"} * 100 < 5\nand\n  node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n"
    for: 1h
    labels:
      severity: warning
  - alert: NodeFilesystemAlmostOutOfSpace
    annotations:
      description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.{% endraw %}'
      summary: 'Filesystem has less than 3% space left.'
    expr: "(\n  node_filesystem_avail_bytes{job=\"node\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node\",fstype!=\"\"} * 100 < 3\nand\n  node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n"
    for: 1h
    labels:
      severity: critical
  - alert: NodeFilesystemFilesFillingUp
    annotations:
      description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up.{% endraw %}'
      summary: 'Filesystem is predicted to run out of inodes within the next 24 hours.'
    expr: "(\n  node_filesystem_files_free{job=\"node\",fstype!=\"\"} / node_filesystem_files{job=\"node\",fstype!=\"\"} * 100 < 40\nand\n  predict_linear(node_filesystem_files_free{job=\"node\",fstype!=\"\"}[6h], 24*60*60) < 0\nand\n  node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n"
    for: 1h
    labels:
      severity: warning
  - alert: NodeFilesystemFilesFillingUp
    annotations:
      description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up fast.{% endraw %}'
      summary: 'Filesystem is predicted to run out of inodes within the next 4 hours.'
    expr: "(\n  node_filesystem_files_free{job=\"node\",fstype!=\"\"} / node_filesystem_files{job=\"node\",fstype!=\"\"} * 100 < 20\nand\n  predict_linear(node_filesystem_files_free{job=\"node\",fstype!=\"\"}[6h], 4*60*60) < 0\nand\n  node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n"
    for: 1h
    labels:
      severity: critical
  - alert: NodeFilesystemAlmostOutOfFiles
    annotations:
      description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.{% endraw %}'
      summary: 'Filesystem has less than 5% inodes left.'
    expr: "(\n  node_filesystem_files_free{job=\"node\",fstype!=\"\"} / node_filesystem_files{job=\"node\",fstype!=\"\"} * 100 < 5\nand\n  node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n"
    for: 1h
    labels:
      severity: warning
  - alert: NodeFilesystemAlmostOutOfFiles
    annotations:
      description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.{% endraw %}'
      summary: 'Filesystem has less than 3% inodes left.'
    expr: "(\n  node_filesystem_files_free{job=\"node\",fstype!=\"\"} / node_filesystem_files{job=\"node\",fstype!=\"\"} * 100 < 3\nand\n  node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n"
    for: 1h
    labels:
      severity: critical
  - alert: NodeNetworkReceiveErrs
    annotations:
      description: '{% raw %}{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.{% endraw %}'
      summary: 'Network interface is reporting many receive errors.'
    expr: "increase(node_network_receive_errs_total[2m]) > 10\n"
    for: 1h
    labels:
      severity: warning
  - alert: NodeNetworkTransmitErrs
    annotations:
      description: '{% raw %}{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.{% endraw %}'
      summary: 'Network interface is reporting many transmit errors.'
    expr: "increase(node_network_transmit_errs_total[2m]) > 10\n"
    for: 1h
    labels:
      severity: warning
  - alert: NodeHighNumberConntrackEntriesUsed
    annotations:
      description: '{% raw %}{{ $value | humanizePercentage }} of conntrack entries are used{% endraw %}'
      summary: 'Number of conntrack are getting close to the limit'
    expr: "(node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75\n"
    labels:
      severity: warning
  # NOTE(review): the two clock alerts use a 'message' annotation where every
  # other rule uses 'description' — kept as-is since notification templates may
  # reference the key; confirm and unify if no consumer depends on it.
  - alert: NodeClockSkewDetected
    annotations:
      message: '{% raw %}Clock on {{ $labels.instance }} is out of sync by more than 300s. Ensure NTP is configured correctly on this host.{% endraw %}'
      summary: 'Clock skew detected.'
    expr: "(\n node_timex_offset_seconds > 0.05\nand\n deriv(node_timex_offset_seconds[5m]) >= 0\n)\nor\n(\n node_timex_offset_seconds < -0.05\nand\n deriv(node_timex_offset_seconds[5m]) <= 0\n)\n"
    for: 10m
    labels:
      severity: warning
  - alert: NodeClockNotSynchronising
    annotations:
      message: '{% raw %}Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host.{% endraw %}'
      summary: 'Clock not synchronising.'
    expr: "min_over_time(node_timex_sync_status[5m]) == 0\n"
    for: 10m
    labels:
      severity: warning
15 changes: 15 additions & 0 deletions roles/prometheus/handlers/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
---
# Handlers for the prometheus role. 'restart' reloads systemd units first
# (needed after a unit-file change); 'reload' only signals the running service.
- name: Restart prometheus
  listen: "restart prometheus"
  become: true
  ansible.builtin.systemd:
    daemon_reload: true
    name: prometheus
    state: restarted

- name: Reload prometheus
  listen: "reload prometheus"
  become: true
  ansible.builtin.systemd:
    name: prometheus
    state: reloaded
136 changes: 136 additions & 0 deletions roles/prometheus/tasks/install.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
---
# Install Prometheus from the official release tarball: system user/group,
# directories, binaries, and the rendered configuration file.

- name: Create prometheus system group
  ansible.builtin.group:
    name: prometheus
    system: true
    state: present

- name: Create application user
  ansible.builtin.user:
    name: prometheus
    system: true
    shell: "/usr/sbin/nologin"
    group: prometheus
    createhome: false
    home: "{{ prometheus_data_dir }}"
  register: created_prometheus_user

- name: Display created users
  ansible.builtin.debug:
    msg: "User: uid:{{ created_prometheus_user.uid }} name:{{ created_prometheus_user.name }} group:{{ created_prometheus_user.group }}"

- name: Create prometheus data directory
  ansible.builtin.file:
    path: "{{ prometheus_data_dir }}"
    state: directory
    owner: prometheus
    group: prometheus
    mode: "0755"

- name: Create prometheus configuration directories
  ansible.builtin.file:
    path: "{{ item }}"
    state: directory
    owner: root
    group: prometheus
    mode: "0770"
  loop:
    - "{{ prometheus_config_dir }}"
    - "{{ prometheus_config_dir }}/rules"

- name: Get prometheus binary
  block:
    - name: Download prometheus binary to local folder
      become: false
      ansible.builtin.get_url:
        url: "{{ prometheus_url }}"
        dest: "/tmp/prometheus-{{ prometheus_version }}.tar.gz"
        mode: "0644"
        # NOTE(review): dropped 'validate_certs: false' — the download is over
        # HTTPS from GitHub and disabling TLS verification invites MITM of the
        # binary; restore only if a corporate proxy genuinely requires it.
      register: _download_archive
      until: _download_archive is succeeded
      retries: 5
      delay: 2

    - name: Unpack prometheus binaries
      become: false
      ansible.builtin.unarchive:
        src: "/tmp/prometheus-{{ prometheus_version }}.tar.gz"
        dest: "/tmp"
        creates: "/tmp/prometheus-{{ prometheus_version }}.linux-amd64/prometheus"
        remote_src: true

    - name: Propagate official prometheus and promtool binaries
      ansible.builtin.copy:
        src: "/tmp/prometheus-{{ prometheus_version }}.linux-amd64/{{ item }}"
        dest: "{{ prometheus_binary_install_dir }}/{{ item }}"
        mode: "0755"
        owner: root
        group: root
        remote_src: true
      loop:
        - prometheus
        - promtool

- name: Create prometheus configuration file
  become: true
  ansible.builtin.template:
    dest: "{{ prometheus_config_dir }}/prometheus.yml"
    force: true
    src: prometheus-config.yml.j2
    owner: prometheus
    group: prometheus
    mode: "0640"

# The tasks below are not yet enabled. NOTE(review): until the systemd service
# tasks are uncommented, prometheus is installed but never started, and the
# scrape config references {{ prometheus_config_dir }}/file_sd files that the
# commented "static targets" task would create.

# - name: Configure prometheus static targets
#   ansible.builtin.copy:
#     content: |
#       {{ item.value | to_nice_yaml(indent=2, sort_keys=False) }}
#     dest: "{{ prometheus_config_dir }}/file_sd/{{ item.key }}.yml"
#     force: true
#     owner: prometheus
#     group: prometheus
#     mode: "0640"
#   with_dict: "{{ prometheus_targets }}"
#   when: prometheus_targets != {}

# - name: Alerting rules file
#   ansible.builtin.template:
#     src: "alert.rules.j2"
#     dest: "{{ prometheus_config_dir }}/rules/ansible_managed.rules"
#     owner: prometheus
#     group: prometheus
#     mode: "0640"
#     validate: "{{ prometheus_binary_install_dir }}/promtool check rules %s"
#   when:
#     - prometheus_alert_rules != []
#   notify:
#     - reload prometheus

# - name: Create systemd service
#   ansible.builtin.template:
#     src: prometheus.service.j2
#     dest: /etc/systemd/system/prometheus.service
#     owner: root
#     group: root
#     mode: "0644"
#   notify:
#     - restart prometheus

# - name: Start and enable service
#   ansible.builtin.systemd:
#     name: prometheus
#     state: started
#     enabled: true
#     daemon_reload: true

# - name: Force all notified handlers to run at this point
#   ansible.builtin.meta: flush_handlers

# - name: Wait for service to become available
#   ansible.builtin.uri:
#     url: "http://localhost:{{ prometheus_port }}/"
#   register: result
#   until: result.status == 200
#   retries: 60
#   delay: 1
11 changes: 11 additions & 0 deletions roles/prometheus/tasks/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
---
# Role entry point: dispatch on prometheus_deployment ('install' | 'removal').

- name: Install Prometheus
  ansible.builtin.include_tasks:
    # Relative to this role's tasks/ directory — the original playbook-relative
    # path ("roles/prometheus/tasks/install.yml") breaks if the role is moved
    # or reused from another roles path, and was inconsistent with the removal
    # include below.
    file: install.yml
  when: prometheus_deployment == 'install'

- name: Remove Prometheus
  ansible.builtin.include_tasks:
    # NOTE(review): removal.yaml is currently an empty file, and its '.yaml'
    # extension is inconsistent with the '.yml' used elsewhere in the role.
    file: removal.yaml
  when: prometheus_deployment == 'removal'
Empty file.
23 changes: 23 additions & 0 deletions roles/prometheus/templates/prometheus-config.yml.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
{{ ansible_managed | comment }}
# http://prometheus.io/docs/operating/configuration/
{# Each to_nice_yaml block is placed at its target column; indent(N, False)
   indents every line after the first, which the literal leading spaces cover. #}

global:
  {{ prometheus_global | to_nice_yaml(indent=2,sort_keys=False) | indent(2, False) }}
  external_labels:
    {{ prometheus_external_labels | to_nice_yaml(indent=2,sort_keys=False) | indent(4, False) }}

rule_files:
  - {{ prometheus_config_dir }}/rules/*.rules

{% if prometheus_alertmanager_config | length > 0 %}
alerting:
  alertmanagers:
  {{ prometheus_alertmanager_config | to_nice_yaml(indent=2,sort_keys=False) | indent(2, False) }}
{% if prometheus_alert_relabel_configs | length > 0 %}
  alert_relabel_configs:
  {{ prometheus_alert_relabel_configs | to_nice_yaml(indent=2,sort_keys=False) | indent(2, False) }}
{% endif %}
{% endif %}

scrape_configs:
  {{ prometheus_scrape_configs | to_nice_yaml(indent=2,sort_keys=False) | indent(2, False) }}

0 comments on commit 67dad54

Please sign in to comment.