# defaults/main.yml — forked from cloudalchemy/ansible-prometheus
---
# Role defaults for the Prometheus server; override per host/group as needed.

# Prometheus release to deploy (tag without the leading "v").
prometheus_version: 2.13.1
# Directory holding prometheus.yml, rules/ and file_sd/ target files.
prometheus_config_dir: /etc/prometheus
# Directory for the on-disk TSDB data.
prometheus_db_dir: /var/lib/prometheus
# Address:port the Prometheus web UI / API binds to.
prometheus_web_listen_address: "0.0.0.0:9090"
# Externally reachable URL for Prometheus; empty string means unset.
prometheus_web_external_url: ''
# Time-based retention for TSDB data.
prometheus_storage_retention: "30d"
# Available since Prometheus 2.7.0
# [EXPERIMENTAL] Maximum number of bytes that can be stored for blocks. Units
# supported: KB, MB, GB, TB, PB. "0" disables the size-based limit.
prometheus_storage_retention_size: "0"
# Extra command-line flags for the prometheus binary (flag name -> value).
prometheus_config_flags_extra: {}
# prometheus_config_flags_extra:
#   storage.tsdb.retention: 15d
#   alertmanager.timeout: 10s
# Alertmanager endpoints that alerts are sent to; empty list = none configured.
prometheus_alertmanager_config: []
# Example:
# prometheus_alertmanager_config:
#   - scheme: https
#     path_prefix: /alertmanager
#     basic_auth:
#       username: user
#       password: pass
#     static_configs:
#       - targets: ["127.0.0.1:9093"]
#     proxy_url: "127.0.0.2"
# Relabeling applied to alerts before they are sent to Alertmanager
# (e.g. dropping an HA replica label so duplicates deduplicate).
prometheus_alert_relabel_configs: []
# Example:
# prometheus_alert_relabel_configs:
#   - action: labeldrop
#     regex: replica
# Values for the `global:` section of prometheus.yml.
# NOTE(review): nesting restored — these keys were flattened to top level
# in the previous revision, which silently left prometheus_global null.
prometheus_global:
  scrape_interval: 15s
  scrape_timeout: 10s
  evaluation_interval: 15s
# remote_write endpoints; empty list disables remote write.
prometheus_remote_write: []
# prometheus_remote_write:
#   - url: https://dev.kausal.co/prom/push
#     basic_auth:
#       password: FOO
# remote_read endpoints; empty list disables remote read.
prometheus_remote_read: []
# prometheus_remote_read:
#   - url: https://demo.cloudalchemy.org:9201/read
#     basic_auth:
#       password: FOO
# Labels attached to any time series or alert leaving this server.
prometheus_external_labels:
  environment: "{{ ansible_fqdn | default(ansible_host) | default(inventory_hostname) }}"
# Map of target-group name -> static targets; presumably rendered into
# {{ prometheus_config_dir }}/file_sd/<name>.yml (see the "node" job below).
prometheus_targets: {}
# Example:
# node:
#   - targets:
#       - localhost:9100
#     labels:
#       env: test
# Scrape jobs rendered into prometheus.yml: Prometheus scrapes itself plus
# any node targets discovered via file_sd.
# NOTE(review): list/mapping nesting restored — it was flattened to column 0
# in the previous revision, which is not valid YAML for this structure.
prometheus_scrape_configs:
  - job_name: "prometheus"
    metrics_path: "{{ prometheus_metrics_path }}"
    static_configs:
      - targets:
          - "{{ ansible_fqdn | default(ansible_host) | default('localhost') }}:9090"
  - job_name: "node"
    file_sd_configs:
      - files:
          - "{{ prometheus_config_dir }}/file_sd/node.yml"
# Alternative config file name, searched in ansible templates path.
prometheus_config_file: 'prometheus.yml.j2'
# Glob patterns for alert-rule files to deploy with the role.
prometheus_alert_rules_files:
  - prometheus/rules/*.rules
# Glob patterns for static target files to deploy.
prometheus_static_targets_files:
  - prometheus/targets/*.yml
  - prometheus/targets/*.json
# Default alerting rules shipped with the role. Annotation strings are wrapped
# in {% raw %} so Jinja leaves the Prometheus {{ $labels.* }} templating intact.
# NOTE(review): rule nesting restored — entries were flattened to column 0 in
# the previous revision; all expr/annotation strings are unchanged byte-for-byte.
prometheus_alert_rules:
  # Always-firing canary: its absence in Alertmanager signals a broken pipeline.
  - alert: Watchdog
    expr: vector(1)
    for: 10m
    labels:
      severity: warning
    annotations:
      description: 'This is an alert meant to ensure that the entire alerting pipeline is functional.
        This alert is always firing, therefore it should always be firing in Alertmanager
        and always fire against a receiver. There are integrations with various notification
        mechanisms that send a notification when this alert is not firing. For example the
        "DeadMansSnitch" integration in PagerDuty.'
      summary: 'Ensure entire alerting pipeline is functional'
  - alert: InstanceDown
    expr: "up == 0"
    for: 5m
    labels:
      severity: critical
    annotations:
      description: "{% raw %}{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes.{% endraw %}"
      summary: "{% raw %}Instance {{ $labels.instance }} down{% endraw %}"
  # CPU busy (100 - idle%) above 96% for 2 minutes.
  - alert: CriticalCPULoad
    expr: '100 - (avg by (instance) (irate(node_cpu_seconds_total{job="node",mode="idle"}[5m])) * 100) > 96'
    for: 2m
    labels:
      severity: critical
    annotations:
      description: "{% raw %}{{ $labels.instance }} of job {{ $labels.job }} has Critical CPU load for more than 2 minutes.{% endraw %}"
      summary: "{% raw %}Instance {{ $labels.instance }} - Critical CPU load{% endraw %}"
  # Memory in use (excluding buffers/cache) above 98%.
  - alert: CriticalRAMUsage
    expr: '(1 - ((node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes) / node_memory_MemTotal_bytes)) * 100 > 98'
    for: 5m
    labels:
      severity: critical
    annotations:
      description: "{% raw %}{{ $labels.instance }} has Critical Memory Usage more than 5 minutes.{% endraw %}"
      summary: "{% raw %}Instance {{ $labels.instance }} has Critical Memory Usage{% endraw %}"
  # Less than 10% free on any real filesystem (tmpfs under /run excluded).
  - alert: CriticalDiskSpace
    expr: 'node_filesystem_free_bytes{mountpoint!~"^/run(/.*|$)",fstype!~"(squashfs|fuse.*)",job="node"} / node_filesystem_size_bytes{job="node"} < 0.1'
    for: 4m
    labels:
      severity: critical
    annotations:
      description: "{% raw %}{{ $labels.instance }} of job {{ $labels.job }} has less than 10% space remaining.{% endraw %}"
      summary: "{% raw %}Instance {{ $labels.instance }} - Critical disk space usage{% endraw %}"
  # No `for:` clause — fires as soon as the metric is non-zero.
  - alert: RebootRequired
    expr: "node_reboot_required > 0"
    labels:
      severity: warning
    annotations:
      description: "{% raw %}{{ $labels.instance }} requires a reboot.{% endraw %}"
      summary: "{% raw %}Instance {{ $labels.instance }} - reboot required{% endraw %}"
  # NTP offset above 30 ms for 2 minutes.
  - alert: ClockSkewDetected
    expr: 'abs(node_timex_offset_seconds) * 1000 > 30'
    for: 2m
    labels:
      severity: warning
    annotations:
      description: "{% raw %}Clock skew detected on {{ $labels.instance }}. Ensure NTP is configured correctly on this host.{% endraw %}"
      summary: "{% raw %}Instance {{ $labels.instance }} - Clock skew detected{% endraw %}"