-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
566526e
commit 67dad54
Showing
7 changed files
with
391 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
---
# Play: apply the prometheus role to every host in the inventory,
# running its tasks with privilege escalation.
- name: Prometheus
  become: true
  hosts: all

  roles:
    - role: prometheus
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,200 @@ | ||
---
# Selects which task file the role's entry point includes: 'install' or 'removal'.
prometheus_deployment: install

# Release to fetch; interpolated into prometheus_url and the /tmp archive paths.
prometheus_version: "2.43.0"
prometheus_url: "https://github.com/prometheus/prometheus/releases/download/v{{ prometheus_version }}/prometheus-{{ prometheus_version }}.linux-amd64.tar.gz"
prometheus_port: 9090

# Filesystem layout used by the install tasks.
prometheus_binary_install_dir: "/usr/local/bin"
prometheus_config_dir: "/etc/prometheus"
prometheus_data_dir: "/var/lib/prometheus"

# NOTE(review): not referenced by any task or template visible here; presumably
# consumed by the systemd unit template - confirm.
prometheus_storage_retention: "30d"
|
||
# Prometheus Configuration vars
# #############################

# Rendered under the `global:` key of the generated prometheus.yml.
prometheus_global:
  scrape_interval: 15s
  scrape_timeout: 10s
  evaluation_interval: 15s

# Rendered under `global.external_labels`.
prometheus_external_labels:
  environment: "{{ ansible_fqdn | default(ansible_host) | default(inventory_hostname) }}"

# Rendered under `alerting.alertmanagers`; an empty list makes the template
# omit the whole `alerting:` section.
prometheus_alertmanager_config:
  - static_configs:
      - targets:
          - "localhost:9093"

# Rendered under `alerting.alert_relabel_configs` when non-empty.
prometheus_alert_relabel_configs: []
# prometheus_alert_relabel_configs:
#   - action: labeldrop
#     regex: replica

# Rendered verbatim under `scrape_configs:`.
prometheus_scrape_configs:
  - job_name: "prometheus"
    metrics_path: "/metrics"
    static_configs:
      - targets:
          # Follow prometheus_port instead of hard-coding 9090 so the
          # self-scrape target stays correct when the port is overridden.
          - "localhost:{{ prometheus_port }}"
  - job_name: "node_exporter"
    file_sd_configs:
      - files:
          - "{{ prometheus_config_dir }}/file_sd/node_exporter.yml"

# Static targets keyed by file_sd file name (one <key>.yml per entry).
prometheus_targets:
  node_exporter:
    - targets:
        - "localhost:9100"
||
# Default alerting rules.
#
# Conventions used below:
#   * `{% raw %} ... {% endraw %}` keeps Prometheus' own `{{ ... }}` template
#     syntax from being evaluated by Ansible/Jinja2.
#   * Multi-line PromQL is written as a literal block scalar (`|`). Block
#     scalars do NOT process backslash escapes, so the expression must contain
#     real newlines and real double quotes - never `\n` or `\"`.
#
# NOTE(review): the filesystem rules select `job="node"`, while the default
# scrape job in these defaults is named "node_exporter". With the defaults as
# shipped these selectors look like they will never match - confirm and align
# one side or the other.
prometheus_alert_rules:
  - alert: Watchdog
    expr: vector(1)
    for: 10m
    labels:
      severity: warning
    annotations:
      description: |
        This is an alert meant to ensure that the entire alerting pipeline is functional.
        This alert is always firing, therefore it should always be firing in Alertmanager
        and always fire against a receiver. There are integrations with various notification
        mechanisms that send a notification when this alert is not firing. For example the
        "DeadMansSnitch" integration in PagerDuty.
      summary: 'Ensure entire alerting pipeline is functional'

  - alert: InstanceDown
    expr: 'up == 0'
    for: 5m
    labels:
      severity: critical
    annotations:
      description: '{% raw %}{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes.{% endraw %}'
      summary: '{% raw %}Instance {{ $labels.instance }} down{% endraw %}'

  - alert: RebootRequired
    expr: 'node_reboot_required > 0'
    labels:
      severity: warning
    annotations:
      description: '{% raw %}{{ $labels.instance }} requires a reboot.{% endraw %}'
      summary: '{% raw %}Instance {{ $labels.instance }} - reboot required{% endraw %}'

  - alert: NodeFilesystemSpaceFillingUp
    annotations:
      description: >
        {% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is
        filling up.{% endraw %}
      summary: 'Filesystem is predicted to run out of space within the next 24 hours.'
    expr: |
      (
        node_filesystem_avail_bytes{job="node",fstype!=""} / node_filesystem_size_bytes{job="node",fstype!=""} * 100 < 40
      and
        predict_linear(node_filesystem_avail_bytes{job="node",fstype!=""}[6h], 24*60*60) < 0
      and
        node_filesystem_readonly{job="node",fstype!=""} == 0
      )
    for: 1h
    labels:
      severity: warning

  - alert: NodeFilesystemSpaceFillingUp
    annotations:
      description: >
        {% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling
        up fast.{% endraw %}
      summary: 'Filesystem is predicted to run out of space within the next 4 hours.'
    expr: |
      (
        node_filesystem_avail_bytes{job="node",fstype!=""} / node_filesystem_size_bytes{job="node",fstype!=""} * 100 < 20
      and
        predict_linear(node_filesystem_avail_bytes{job="node",fstype!=""}[6h], 4*60*60) < 0
      and
        node_filesystem_readonly{job="node",fstype!=""} == 0
      )
    for: 1h
    labels:
      severity: critical

  - alert: NodeFilesystemAlmostOutOfSpace
    annotations:
      description: >
        {% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.{% endraw %}
      summary: 'Filesystem has less than 5% space left.'
    expr: |
      (
        node_filesystem_avail_bytes{job="node",fstype!=""} / node_filesystem_size_bytes{job="node",fstype!=""} * 100 < 5
      and
        node_filesystem_readonly{job="node",fstype!=""} == 0
      )
    for: 1h
    labels:
      severity: warning

  - alert: NodeFilesystemAlmostOutOfSpace
    annotations:
      description: >
        {% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.{% endraw %}
      summary: 'Filesystem has less than 3% space left.'
    expr: |
      (
        node_filesystem_avail_bytes{job="node",fstype!=""} / node_filesystem_size_bytes{job="node",fstype!=""} * 100 < 3
      and
        node_filesystem_readonly{job="node",fstype!=""} == 0
      )
    for: 1h
    labels:
      severity: critical

  - alert: NodeFilesystemFilesFillingUp
    annotations:
      description: >
        {% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up.{% endraw %}
      summary: 'Filesystem is predicted to run out of inodes within the next 24 hours.'
    expr: |
      (
        node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 40
      and
        predict_linear(node_filesystem_files_free{job="node",fstype!=""}[6h], 24*60*60) < 0
      and
        node_filesystem_readonly{job="node",fstype!=""} == 0
      )
    for: 1h
    labels:
      severity: warning

  - alert: NodeFilesystemFilesFillingUp
    annotations:
      description: >
        {% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and
        is filling up fast.{% endraw %}
      summary: 'Filesystem is predicted to run out of inodes within the next 4 hours.'
    expr: |
      (
        node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 20
      and
        predict_linear(node_filesystem_files_free{job="node",fstype!=""}[6h], 4*60*60) < 0
      and
        node_filesystem_readonly{job="node",fstype!=""} == 0
      )
    for: 1h
    labels:
      severity: critical

  - alert: NodeFilesystemAlmostOutOfFiles
    annotations:
      description: >
        {% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.{% endraw %}
      summary: 'Filesystem has less than 5% inodes left.'
    expr: |
      (
        node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 5
      and
        node_filesystem_readonly{job="node",fstype!=""} == 0
      )
    for: 1h
    labels:
      severity: warning

  - alert: NodeFilesystemAlmostOutOfFiles
    annotations:
      description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.{% endraw %}'
      summary: 'Filesystem has less than 3% inodes left.'
    expr: |
      (
        node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 3
      and
        node_filesystem_readonly{job="node",fstype!=""} == 0
      )
    for: 1h
    labels:
      severity: critical

  - alert: NodeNetworkReceiveErrs
    annotations:
      description: '{% raw %}{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.{% endraw %}'
      summary: 'Network interface is reporting many receive errors.'
    expr: "increase(node_network_receive_errs_total[2m]) > 10\n"
    for: 1h
    labels:
      severity: warning

  - alert: NodeNetworkTransmitErrs
    annotations:
      description: '{% raw %}{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.{% endraw %}'
      summary: 'Network interface is reporting many transmit errors.'
    expr: "increase(node_network_transmit_errs_total[2m]) > 10\n"
    for: 1h
    labels:
      severity: warning

  - alert: NodeHighNumberConntrackEntriesUsed
    annotations:
      description: '{% raw %}{{ $value | humanizePercentage }} of conntrack entries are used{% endraw %}'
      summary: 'Number of conntrack are getting close to the limit'
    expr: "(node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75\n"
    labels:
      severity: warning

  # NOTE(review): the message mentions 300s while the expression compares the
  # offset against 0.05s - confirm which threshold is intended.
  - alert: NodeClockSkewDetected
    annotations:
      message: '{% raw %}Clock on {{ $labels.instance }} is out of sync by more than 300s. Ensure NTP is configured correctly on this host.{% endraw %}'
      summary: 'Clock skew detected.'
    expr: |
      (
        node_timex_offset_seconds > 0.05
      and
        deriv(node_timex_offset_seconds[5m]) >= 0
      )
      or
      (
        node_timex_offset_seconds < -0.05
      and
        deriv(node_timex_offset_seconds[5m]) <= 0
      )
    for: 10m
    labels:
      severity: warning

  - alert: NodeClockNotSynchronising
    annotations:
      message: '{% raw %}Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host.{% endraw %}'
      summary: 'Clock not synchronising.'
    expr: "min_over_time(node_timex_sync_status[5m]) == 0\n"
    for: 10m
    labels:
      severity: warning
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
--- | ||
# Handlers for the prometheus systemd unit. Each one also subscribes to a
# lower-case `listen` topic, so tasks can notify either the handler name or
# the topic.

# Re-read unit files (daemon_reload) and restart the service.
- name: Restart prometheus
  become: true
  listen: "restart prometheus"
  ansible.builtin.systemd:
    name: prometheus
    state: restarted
    daemon_reload: true

# Ask systemd to reload the unit instead of restarting it.
- name: Reload prometheus
  become: true
  listen: "reload prometheus"
  ansible.builtin.systemd:
    name: prometheus
    state: reloaded
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,136 @@ | ||
--- | ||
# System group and user that own the data directory and the generated
# configuration file.
- name: Create prometheus system group
  ansible.builtin.group:
    name: prometheus
    system: true
    state: present

# No login shell and no home directory is created; `home` only records the
# data directory as the account's home path.
- name: Create application user
  ansible.builtin.user:
    name: prometheus
    system: true
    shell: "/usr/sbin/nologin"
    group: prometheus
    create_home: false
    home: "{{ prometheus_data_dir }}"
  register: created_prometheus_user

- name: Display created users
  ansible.builtin.debug:
    msg: "User: uid:{{ created_prometheus_user.uid }} name:{{ created_prometheus_user.name }} group:{{ created_prometheus_user.group }}"

- name: Create prometheus data directory
  ansible.builtin.file:
    path: "{{ prometheus_data_dir }}"
    state: directory
    owner: prometheus
    group: prometheus
    mode: "0755"

# Root-owned, group-writable by prometheus. `file_sd` is included because the
# default scrape configuration points at {{ prometheus_config_dir }}/file_sd/.
- name: Create prometheus configuration directories
  ansible.builtin.file:
    path: "{{ item }}"
    state: directory
    owner: root
    group: prometheus
    mode: "0770"
  loop:
    - "{{ prometheus_config_dir }}"
    - "{{ prometheus_config_dir }}/rules"
    - "{{ prometheus_config_dir }}/file_sd"
|
||
# Download the release archive on the target host, unpack it under /tmp and
# install the `prometheus` and `promtool` binaries.
- name: Get prometheus binary
  block:
    # TLS certificate validation is left at its default (enabled): the archive
    # contains binaries that are installed as root, so the download must not
    # accept an unverified peer.
    - name: Download prometheus binary to local folder
      become: false
      ansible.builtin.get_url:
        url: "{{ prometheus_url }}"
        dest: "/tmp/prometheus-{{ prometheus_version }}.tar.gz"
        mode: "0644"
      register: _download_archive
      until: _download_archive is succeeded
      retries: 5
      delay: 2

    # `creates` skips extraction when the binary from a previous run exists.
    - name: Unpack prometheus binaries
      become: false
      ansible.builtin.unarchive:
        src: "/tmp/prometheus-{{ prometheus_version }}.tar.gz"
        dest: "/tmp"
        creates: "/tmp/prometheus-{{ prometheus_version }}.linux-amd64/prometheus"
        remote_src: true

    # remote_src: the source files already live on the target host.
    - name: Propagate official prometheus and promtool binaries
      ansible.builtin.copy:
        src: "/tmp/prometheus-{{ prometheus_version }}.linux-amd64/{{ item }}"
        dest: "{{ prometheus_binary_install_dir }}/{{ item }}"
        mode: "0755"
        owner: root
        group: root
        remote_src: true
      loop:
        - prometheus
        - promtool
|
||
# Render prometheus.yml from the role template; always overwrites the
# destination (force) and restricts it to the prometheus user and group.
- name: Create prometheus configuration file
  become: true
  ansible.builtin.template:
    src: prometheus-config.yml.j2
    dest: "{{ prometheus_config_dir }}/prometheus.yml"
    owner: prometheus
    group: prometheus
    mode: "0640"
    force: true
|
||
# - name: Configure prometheus static targets | ||
# copy: | ||
# content: | | ||
# {{ item.value | to_nice_yaml(indent=2, sort_keys=False) }} | ||
# dest: "{{ prometheus_config_dir }}/file_sd/{{ item.key }}.yml" | ||
# force: true | ||
# owner: prometheus | ||
# group: prometheus | ||
# mode: 0640 | ||
# with_dict: "{{ prometheus_targets }}" | ||
# when: prometheus_targets != {} | ||
|
||
# - name: Alerting rules file | ||
# ansible.builtin.template: | ||
# src: "alert.rules.j2" | ||
# dest: "{{ prometheus_config_dir }}/rules/ansible_managed.rules" | ||
# owner: prometheus | ||
# group: prometheus | ||
# mode: 0640 | ||
# validate: "{{ prometheus_binary_install_dir }}/promtool check rules %s" | ||
# when: | ||
# - prometheus_alert_rules != [] | ||
# notify: | ||
# - reload prometheus | ||
|
||
# - name: Create systemd service | ||
# ansible.builtin.template: | ||
# src: prometheus.service.j2 | ||
# dest: /etc/systemd/system/prometheus.service | ||
# owner: root | ||
# group: root | ||
# mode: 0644 | ||
# notify: | ||
# - restart prometheus | ||
|
||
# - name: Start and enable service | ||
# ansible.builtin.systemd: | ||
# name: prometheus | ||
# state: started | ||
# enabled: true | ||
# daemon_reload: true | ||
|
||
# - name: Force all notified handlers to run at this point | ||
# meta: flush_handlers | ||
|
||
# - name: Wait for service to become available | ||
# ansible.builtin.uri: | ||
# url: "http://localhost:{{ prometheus_port }}/" | ||
# register: result | ||
# until: result.status == 200 | ||
# retries: 60 | ||
# delay: 1 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
--- | ||
|
||
# Role entry point: dispatch on prometheus_deployment ('install' or 'removal').
# Both includes use file names relative to this role's tasks directory so the
# role works regardless of where it is installed relative to the playbook.
- name: Install Prometheus
  ansible.builtin.include_tasks:
    file: install.yml
  when: prometheus_deployment == 'install'

- name: Remove Prometheus
  ansible.builtin.include_tasks:
    file: removal.yaml
  when: prometheus_deployment == 'removal'
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
{{ ansible_managed | comment }}
# http://prometheus.io/docs/operating/configuration/

{# Each variable is dumped as YAML; indent(N, False) indents every line except the first, which takes its indentation from this template. #}
global:
  {{ prometheus_global | to_nice_yaml(indent=2, sort_keys=False) | indent(2, False) }}
  external_labels:
    {{ prometheus_external_labels | to_nice_yaml(indent=2, sort_keys=False) | indent(4, False) }}

rule_files:
  - {{ prometheus_config_dir }}/rules/*.rules

{# The alerting section is emitted only when at least one Alertmanager is configured. #}
{% if prometheus_alertmanager_config | length > 0 %}
alerting:
  alertmanagers:
  {{ prometheus_alertmanager_config | to_nice_yaml(indent=2, sort_keys=False) | indent(2, False) }}
{% if prometheus_alert_relabel_configs | length > 0 %}
  alert_relabel_configs:
  {{ prometheus_alert_relabel_configs | to_nice_yaml(indent=2, sort_keys=False) | indent(2, False) }}
{% endif %}
{% endif %}

scrape_configs:
  {{ prometheus_scrape_configs | to_nice_yaml(indent=2, sort_keys=False) | indent(2, False) }}