# defaults/main.yml — forked from cloudalchemy/ansible-prometheus
---
# Role defaults for the Prometheus server; override per host/group as needed.

# Prometheus release to deploy (tag without the leading "v").
prometheus_version: 2.13.1
# Directory holding prometheus.yml, rules/ and file_sd/ target files.
prometheus_config_dir: /etc/prometheus
# Directory for the on-disk TSDB data.
prometheus_db_dir: /var/lib/prometheus
# Address:port the Prometheus web UI / API binds to.
prometheus_web_listen_address: "0.0.0.0:9090"
# Externally reachable URL for Prometheus; empty string means unset.
prometheus_web_external_url: ''
# Time-based retention for TSDB data.
prometheus_storage_retention: "30d"
# Available since Prometheus 2.7.0
# [EXPERIMENTAL] Maximum number of bytes that can be stored for blocks. Units
# supported: KB, MB, GB, TB, PB. "0" disables the size-based limit.
prometheus_storage_retention_size: "0"
# Extra command-line flags for the prometheus binary (flag name -> value).
prometheus_config_flags_extra: {}
# prometheus_config_flags_extra:
#   storage.tsdb.retention: 15d
#   alertmanager.timeout: 10s
# Alertmanager endpoints that alerts are sent to; empty list = none configured.
prometheus_alertmanager_config: []
# Example:
# prometheus_alertmanager_config:
#   - scheme: https
#     path_prefix: /alertmanager
#     basic_auth:
#       username: user
#       password: pass
#     static_configs:
#       - targets: ["127.0.0.1:9093"]
#     proxy_url: "127.0.0.2"
# Relabeling applied to alerts before they are sent to Alertmanager
# (e.g. dropping an HA replica label so duplicates deduplicate).
prometheus_alert_relabel_configs: []
# Example:
# prometheus_alert_relabel_configs:
#   - action: labeldrop
#     regex: replica
# Values for the `global:` section of prometheus.yml.
# NOTE(review): nesting restored — these keys were flattened to top level
# in the previous revision, which silently left prometheus_global null.
prometheus_global:
  scrape_interval: 15s
  scrape_timeout: 10s
  evaluation_interval: 15s
# remote_write endpoints; empty list disables remote write.
prometheus_remote_write: []
# prometheus_remote_write:
#   - url: https://dev.kausal.co/prom/push
#     basic_auth:
#       password: FOO
# remote_read endpoints; empty list disables remote read.
prometheus_remote_read: []
# prometheus_remote_read:
#   - url: https://demo.cloudalchemy.org:9201/read
#     basic_auth:
#       password: FOO
# Labels attached to any time series or alert leaving this server.
prometheus_external_labels:
  environment: "{{ ansible_fqdn | default(ansible_host) | default(inventory_hostname) }}"
# Map of target-group name -> static targets; presumably rendered into
# {{ prometheus_config_dir }}/file_sd/<name>.yml (see the "node" job below).
prometheus_targets: {}
# Example:
# node:
#   - targets:
#       - localhost:9100
#     labels:
#       env: test
# Scrape jobs rendered into prometheus.yml: Prometheus scrapes itself plus
# any node targets discovered via file_sd.
# NOTE(review): list/mapping nesting restored — it was flattened to column 0
# in the previous revision, which is not valid YAML for this structure.
prometheus_scrape_configs:
  - job_name: "prometheus"
    metrics_path: "{{ prometheus_metrics_path }}"
    static_configs:
      - targets:
          - "{{ ansible_fqdn | default(ansible_host) | default('localhost') }}:9090"
  - job_name: "node"
    file_sd_configs:
      - files:
          - "{{ prometheus_config_dir }}/file_sd/node.yml"
# Alternative config file name, searched in ansible templates path.
prometheus_config_file: 'prometheus.yml.j2'
# Glob patterns for alert-rule files to deploy with the role.
prometheus_alert_rules_files:
  - prometheus/rules/*.rules
# Glob patterns for static target files to deploy.
prometheus_static_targets_files:
  - prometheus/targets/*.yml
  - prometheus/targets/*.json
# Default alerting rules shipped with the role. Annotation strings are wrapped
# in {% raw %} so Jinja leaves the Prometheus {{ $labels.* }} templating intact.
# NOTE(review): rule nesting restored — entries were flattened to column 0 in
# the previous revision; all expr/annotation strings are unchanged byte-for-byte.
prometheus_alert_rules:
  # Always-firing canary: its absence in Alertmanager signals a broken pipeline.
  - alert: Watchdog
    expr: vector(1)
    for: 10m
    labels:
      severity: warning
    annotations:
      description: 'This is an alert meant to ensure that the entire alerting pipeline is functional.
        This alert is always firing, therefore it should always be firing in Alertmanager
        and always fire against a receiver. There are integrations with various notification
        mechanisms that send a notification when this alert is not firing. For example the
        "DeadMansSnitch" integration in PagerDuty.'
      summary: 'Ensure entire alerting pipeline is functional'
  - alert: InstanceDown
    expr: "up == 0"
    for: 5m
    labels:
      severity: critical
    annotations:
      description: "{% raw %}{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes.{% endraw %}"
      summary: "{% raw %}Instance {{ $labels.instance }} down{% endraw %}"
  # CPU busy (100 - idle%) above 96% for 2 minutes.
  - alert: CriticalCPULoad
    expr: '100 - (avg by (instance) (irate(node_cpu_seconds_total{job="node",mode="idle"}[5m])) * 100) > 96'
    for: 2m
    labels:
      severity: critical
    annotations:
      description: "{% raw %}{{ $labels.instance }} of job {{ $labels.job }} has Critical CPU load for more than 2 minutes.{% endraw %}"
      summary: "{% raw %}Instance {{ $labels.instance }} - Critical CPU load{% endraw %}"
  # Memory in use (excluding buffers/cache) above 98%.
  - alert: CriticalRAMUsage
    expr: '(1 - ((node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes) / node_memory_MemTotal_bytes)) * 100 > 98'
    for: 5m
    labels:
      severity: critical
    annotations:
      description: "{% raw %}{{ $labels.instance }} has Critical Memory Usage more than 5 minutes.{% endraw %}"
      summary: "{% raw %}Instance {{ $labels.instance }} has Critical Memory Usage{% endraw %}"
  # Less than 10% free on any real filesystem (tmpfs under /run excluded).
  - alert: CriticalDiskSpace
    expr: 'node_filesystem_free_bytes{mountpoint!~"^/run(/.*|$)",fstype!~"(squashfs|fuse.*)",job="node"} / node_filesystem_size_bytes{job="node"} < 0.1'
    for: 4m
    labels:
      severity: critical
    annotations:
      description: "{% raw %}{{ $labels.instance }} of job {{ $labels.job }} has less than 10% space remaining.{% endraw %}"
      summary: "{% raw %}Instance {{ $labels.instance }} - Critical disk space usage{% endraw %}"
  # No `for:` clause — fires as soon as the metric is non-zero.
  - alert: RebootRequired
    expr: "node_reboot_required > 0"
    labels:
      severity: warning
    annotations:
      description: "{% raw %}{{ $labels.instance }} requires a reboot.{% endraw %}"
      summary: "{% raw %}Instance {{ $labels.instance }} - reboot required{% endraw %}"
  # NTP offset above 30 ms for 2 minutes.
  - alert: ClockSkewDetected
    expr: 'abs(node_timex_offset_seconds) * 1000 > 30'
    for: 2m
    labels:
      severity: warning
    annotations:
      description: "{% raw %}Clock skew detected on {{ $labels.instance }}. Ensure NTP is configured correctly on this host.{% endraw %}"
      summary: "{% raw %}Instance {{ $labels.instance }} - Clock skew detected{% endraw %}"