Skip to content

Commit

Permalink
Merge pull request #238 from alert-18
Browse files Browse the repository at this point in the history
More monitoring & alerting from 2018-epic, closes #226.
  • Loading branch information
darkk authored Oct 31, 2018
2 parents bf9c967 + e7a5b2e commit eaeb1c1
Show file tree
Hide file tree
Showing 13 changed files with 116 additions and 29 deletions.
9 changes: 9 additions & 0 deletions ansible/host_vars/prometheus.infra.ooni.io/vars.yml
Original file line number Diff line number Diff line change
Expand Up @@ -91,4 +91,13 @@ blackbox_jobs:
- name: "tor testhelper" # see also SSLCertExpires in roles/prometheus/files/alert_rules.yml
module: tls_snakeoil
targets: "{{ groups['have_tor'] | map('regex_replace', '$', ':9001') | list }}"

# Probe sshd on every host by checking the SSH protocol banner on :22.
- name: ssh
  module: ssh_banner
  # staticiforme.torproject.org ssh port is only reachable via jumphost
  targets: "{{ groups['all'] | difference(['staticiforme.torproject.org']) | map('regex_replace', '$', ':22') | list }}"

# Basic liveness: ICMP echo to every inventory host.
- name: icmp
  module: icmp
  targets: "{{ groups['all'] | list }}"
...
2 changes: 2 additions & 0 deletions ansible/inventory
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ ath
ber
tpo
bigv
fra

[gh:children]
wdc # technically it's i95.net, Radio Free Asia network, but GH has some boxes there
Expand Down Expand Up @@ -48,6 +49,7 @@ ooni-explorer-next.test.ooni.io
wiki.ooni.io
labs.ooni.io
hkgjump.ooni.nu
hkgsuperset.ooni.io

[ams]
explorer.ooni.io
Expand Down
10 changes: 10 additions & 0 deletions ansible/inventory-check.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,16 @@
---
- import_playbook: ansible-version.yml

# Inventory sanity check: every host must be reachable through the dom0
# group hierarchy, otherwise it is silently excluded from dom0-rooted plays.
- hosts: all
  gather_facts: false  # pure inventory check — no need to contact the hosts
  tasks:
    - name: ensure that all inventory hosts are rooted to dom0
      assert:
        that:
          - groups.all | difference(groups.dom0) | length == 0
        msg: "Hosts in inventory not rooted to dom0: {{ groups.all | difference(groups.dom0) | sort | join(' ') }}"
      run_once: true  # the assertion is inventory-global, one evaluation suffices

- hosts: all
vars:
ansible_become: false # root is not required here, also it's not `become: false` as variable declared for `all` has precedence over directive :-/
Expand Down
32 changes: 9 additions & 23 deletions ansible/roles/airflow/files/nodeexp_airflow
Original file line number Diff line number Diff line change
Expand Up @@ -6,37 +6,23 @@ export PGHOSTADDR=`docker inspect af-psql | jq -r '.[].NetworkSettings.Networks.

dest=`mktemp -t textfile.XXXXXXXXXX`

# It previously was airflow_7day_task_count{state=*} (the total number of
# TaskInstances within last 7 days in specific state). It turned out that only
# `failed` state matters.
cat >"$dest" <<EOF
# HELP airflow_7day_task_count The total number of TaskInstances within last 7 days in specific state.
# TYPE airflow_7day_task_count gauge
# HELP airflow_7d_failed_count The total number of failed TaskInstances within last 7 days.
# TYPE airflow_7d_failed_count gauge
EOF

/sbin/start-stop-daemon --chuid daemon:daemon \
--exec /usr/bin/psql --start \
-- \
--no-align --tuples-only airflow >>"$dest" <<EOF
WITH known_state (state) AS ( VALUES
('None'), -- NB: JOIN on NULL is not possible
('removed'),
('scheduled'),
('queued'),
('running'),
('success'),
('shutdown'),
('failed'),
('up_for_retry'),
('upstream_failed'),
('skipped'))
SELECT
'airflow_7day_task_count{state="' || state || '"} ' || COALESCE(cnt, 0)
FROM
known_state
LEFT JOIN (
SELECT COALESCE(state, 'None') as state, COUNT(*) AS cnt
FROM task_instance
WHERE execution_date >= NOW() - INTERVAL '7 DAYS'
GROUP BY state
) t1 USING (state)
'airflow_7d_failed_count{task="' || task_id || '",dag="' || dag_id || '"} ' || SUM(CASE WHEN state = 'failed' THEN 1 ELSE 0 END)
FROM task_instance
WHERE execution_date >= NOW() - INTERVAL '7 DAYS'
GROUP BY task_id, dag_id
EOF

cp "$dest" /run/nodeexp/airflow.prom
Expand Down
1 change: 0 additions & 1 deletion ansible/roles/airflow/tasks/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@

# Install the hourly cron job that writes airflow metrics for node_exporter.
# NOTE: the original line repeated `mode=0555` twice; duplicate removed.
- name: copy nodeexp_airflow check
  copy: src=nodeexp_airflow dest=/etc/cron.hourly/nodeexp_airflow mode=0555 owner=root group=root
  notify: restart af-worker # because the _file_ is bind-mounted, not a directory

- name: shared airflow config templates
file: dest=/srv/etc/af-share-{{ airflow_tag }} state=directory owner=root group=root mode=0750
Expand Down
8 changes: 8 additions & 0 deletions ansible/roles/blackbox_exporter/tasks/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,13 @@
group: root
remote_src: true

# TODO: AmbientCapabilities= needs systemd >= 229, but we currently have
# systemd 215, so grant cap_net_raw as a file capability on the binary
# instead (raw sockets are needed for the icmp prober added in blackbox.yml.j2).
- name: Set cap_net_raw for blackbox_exporter
  capabilities:
    path: '{{ blackbox_exporter_path }}/blackbox_exporter'
    capability: cap_net_raw+ep
    state: present

- name: Install config file
template:
src: blackbox.yml.j2
Expand All @@ -41,6 +48,7 @@

- name: Install blackbox_exporter systemd service file
notify:
- systemctl daemon-reload
- restart blackbox_exporter
template:
src: blackbox_exporter.service
Expand Down
3 changes: 3 additions & 0 deletions ansible/roles/blackbox_exporter/templates/blackbox.yml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ modules:
prober: tcp
timeout: 5s
tcp:
preferred_ip_protocol: ip4
query_response:
- expect: "^SSH-2.0-"
send: "SSH-2.0-blackbox_exporter OONI-prometheus-0.0\x0d" # WTF: \x0a is auto-added https://github.com/prometheus/blackbox_exporter/blob/master/tcp.go#L127
Expand All @@ -43,6 +44,8 @@ modules:
icmp:
prober: icmp
timeout: 5s
icmp:
preferred_ip_protocol: ip4

tls_snakeoil:
prober: tcp
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@ Description=Prometheus Blackbox Exporter
User={{ passwd.prombbox.login }}
Group={{ passwd.prombbox.login }}
EnvironmentFile=-/etc/default/blackbox_exporter
AmbientCapabilities=CAP_NET_RAW
# TODO: AmbientCapabilities= needs systemd >= 229, but we currently have systemd 215
# AmbientCapabilities=CAP_NET_RAW
CapabilityBoundingSet=CAP_NET_RAW
ExecStart={{ blackbox_exporter_path }}/blackbox_exporter \
--config.file={{ blackbox_exporter_base }}/blackbox.yml \
Expand Down
3 changes: 3 additions & 0 deletions ansible/roles/node_exporter/defaults/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@ node_exporter_collectors: >
--no-collector.arp --no-collector.bcache --no-collector.infiniband
--no-collector.ipvs --no-collector.wifi --no-collector.zfs
--collector.ntp --collector.ntp.local-offset-tolerance=5ms
{% if ansible_service_mgr == 'systemd' %}
--collector.systemd --collector.systemd.unit-whitelist=^$
{% endif %}
node_exporter_disk_ignored: ""

Expand Down
4 changes: 3 additions & 1 deletion ansible/roles/node_exporter/files/seeksample.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,9 @@ def exporter(dataq, fpath):
'# HELP node_seeksample_timestamp Unixtime of last successful seek() sampling.\n'
'# TYPE node_seeksample_timestamp gauge\n'
]+ ['node_seeksample_timestamp{device="%s"} %f\n' % (d, stats[d]['timestamp']) for d in stats])
with open(fpath, 'r+') as fd: # `r+` instead of `w` should prevent memory allocation that may be hard during OOM
# `r+` instead of `w` should prevent memory allocation that may be hard during OOM
mode = 'r+' if os.path.exists(fpath) else 'w'
with open(fpath, mode) as fd:
size = os.path.getsize(fpath)
if size > len(prom):
prom += '#' * (size - len(prom) - 1) # pseudo-atomicity to have fixed-size file
Expand Down
11 changes: 11 additions & 0 deletions ansible/roles/node_exporter/tasks/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,17 @@
owner: root
group: root
remote_src: true # file is ALREADY on the remote system. sigh.
creates: '{{ node_exporter_base }}/{{ node_exporter_basename }}/node_exporter'

# For some unknown reason some nodes do not have `dbus`, but the systemd
# collector enabled in defaults/main.yml depends on it :-/
- name: install dbus to punch hole to systemd
  apt:
    name: dbus
    state: present
    update_cache: true       # canonical boolean instead of YAML-1.1 `yes`
    cache_valid_time: 28800  # refresh apt cache at most every 8 hours
    install_recommends: false
  when: ansible_service_mgr == 'systemd'

- name: Install node_exporter systemd service file
notify:
Expand Down
6 changes: 4 additions & 2 deletions ansible/roles/prometheus/files/alert_ooni.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ groups:
annotations:
summary: '`rsync` exitcode: {{ $value }}'

# Some nodes may have it broken, but it usually works, here is expr to validate it.
# expr: (max without (device) (node_seeksample_timestamp{job="node"}) or up{job="node"}) == 1
- alert: DiskStuck
expr: node_time - ignoring(device) group_right node_seeksample_timestamp > 240
annotations:
Expand All @@ -26,9 +28,9 @@ groups:
description: '{{ if eq $labels.instance "ooni.torproject.org:443" }}Run `make update-site`.{{ end }}'

- alert: AirflowFailed
expr: airflow_7day_task_count{state="failed"} > 0
expr: airflow_7d_failed_count > 0
annotations:
summary: '{{ $value }} tasks report failures last week'
summary: '{{ $value }} task `{{ $labels.task }}` failures last week'

- alert: SlacktopusGone
expr: time() - slacktopus_ctcp_pong_time > 660 # it's randomized */5 crontab
Expand Down
53 changes: 52 additions & 1 deletion ansible/roles/prometheus/files/alert_rules.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,56 @@ groups:
annotations:
summary: '{{ $labels.instance }} is not `up`'

- alert: systemd # yes, just "systemd", it's unclear what's going wrong :-)
expr: node_systemd_system_running != 1 # that's basically output of `systemctl is-system-running`
annotations:
summary: '{{ $labels.instance }} is not OK, check `systemctl list-units | grep failed`'

- alert: IOWaitHigh
expr: max(irate(node_cpu{mode="iowait"}[1m])) by (instance) > 0.9
expr: sum without (cpu) (irate(node_cpu{mode="iowait"}[1m])) > 0.9
for: 5m # matters to avoid spikes
annotations:
summary: '{{ $labels.instance }} %iowait over 90%'

# the difference between node_disk_{io,read,write}_time_ms is not clear, `io` is NOT `read + write`, it may be greater, it may be less...
# All the nodes have `node_disk_io_time_ms`, but it can be verified with expr: (sum without(device) (node_disk_io_time_ms{job="node"}) or up{job="node"}) == 1
- alert: IOHigh
expr: irate(node_disk_io_time_ms{device!~"(nbd[0-9]+|dm-[0-9]+|ram[0-9]+|sr[0-9]+|md[0-9]+)"}[1m]) > 800
for: 2h
annotations:
summary: '{{ $labels.instance }}/{{ $labels.device }} spends {{ $value }}ms/s in IO over 2 hours'

- alert: CPUHigh
expr: sum without (mode, cpu) (irate(node_cpu{mode!="idle"}[1m])) > 0.75
for: 8h
annotations:
summary: '{{ $labels.instance }} has {{ printf "%.2f" $value }} CPU used over 8 hours'

- alert: NetworkRXHigh
expr: irate(node_network_receive_bytes{device!~"(docker0|veth[0-9a-f]{7}|lo|br[-a-z].*|dummy0)"}[1m]) * 8 > 50 * 1024 * 1024 # OONITestHelper has BandwidthRate 20MBits
for: 1h
annotations:
summary: '{{ $labels.instance }}/{{ $labels.device }} gets {{ $value | humanize }}bit/s'

- alert: NetworkTXHigh
expr: irate(node_network_transmit_bytes{device!~"(docker0|veth[0-9a-f]{7}|lo|br[-a-z].*|dummy0)"}[1m]) * 8 > 50 * 1024 * 1024 # OONITestHelper has BandwidthRate 20MBits
for: 1h
annotations:
summary: '{{ $labels.instance }}/{{ $labels.device }} sends {{ $value | humanize }}bit/s'

- alert: LoadAverageHigh
expr: node_load15 > 8 # largest node available has NCPU=4
for: 1h
annotations:
summary: '{{ $labels.instance }} has LA15 {{ printf "%.1f" $value }}'

# chameleon.infra.ooni.io and b.echo.th.ooni.io do not have MemAvailable signal that is an _estimate_ of RAM available for userspace allocations provided by kernel.
# These old nodes can be listed with expr: (node_memory_MemAvailable or (up - 1)) / node_memory_MemTotal == 0
- alert: MemoryLow
expr: ((node_memory_MemAvailable) / node_memory_MemTotal or ((node_memory_MemFree + node_memory_Buffers + node_memory_Cached) / node_memory_MemTotal)) * 100 < 20
annotations:
summary: '{{ $labels.instance }} has {{ printf "%.0f" $value }}% RAM available'

- alert: NodeSwapping
expr: rate(node_vmstat_pgmajfault[1m]) > 100 # and instance:node_memory_available:ratio < 0.05
for: 15m # number stolen from https://dev.gitlab.org/cookbooks/runbooks/blob/master/rules/node.yml
Expand Down Expand Up @@ -95,4 +139,11 @@ groups:
annotations:
summary: '`certbot` failed at {{ $labels.instance }}'
description: 'SSL cert expires in {{ printf "%.0f" $value }} days.'

# Meta-monitoring: alert when alerting itself is broken.
- name: Prometheus self-care
  rules:
    # Any non-zero rate of failed notification deliveries is worth a look.
    - alert: AlertmanagerNotificationsFailing
      expr: rate(alertmanager_notifications_failed_total[1m]) > 0
      annotations:
        summary: 'Alertmanager {{ $labels.instance }} fails {{ $labels.integration }} notifications.'
...

0 comments on commit eaeb1c1

Please sign in to comment.