Skip to content

Commit

Permalink
Merge pull request #238 from alert-18
Browse files Browse the repository at this point in the history
More monitoring & alerting from 2018-epic, closes #226.
  • Loading branch information
darkk authored Oct 31, 2018
2 parents bf9c967 + e7a5b2e commit eaeb1c1
Show file tree
Hide file tree
Showing 13 changed files with 116 additions and 29 deletions.
9 changes: 9 additions & 0 deletions ansible/host_vars/prometheus.infra.ooni.io/vars.yml
Original file line number Diff line number Diff line change
Expand Up @@ -91,4 +91,13 @@ blackbox_jobs:
- name: "tor testhelper" # see also SSLCertExpires in roles/prometheus/files/alert_rules.yml
module: tls_snakeoil
targets: "{{ groups['have_tor'] | map('regex_replace', '$', ':9001') | list }}"

# Probe sshd on every host by checking the SSH protocol banner on :22.
- name: ssh
  module: ssh_banner
  # staticiforme.torproject.org ssh port is only reachable via jumphost
  targets: "{{ groups['all'] | difference(['staticiforme.torproject.org']) | map('regex_replace', '$', ':22') | list }}"

# Basic liveness: ICMP echo to every inventory host.
- name: icmp
  module: icmp
  targets: "{{ groups['all'] | list }}"
...
2 changes: 2 additions & 0 deletions ansible/inventory
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ ath
ber
tpo
bigv
fra

[gh:children]
wdc # technically it's i95.net, Radio Free Asia network, but GH has some boxes there
Expand Down Expand Up @@ -48,6 +49,7 @@ ooni-explorer-next.test.ooni.io
wiki.ooni.io
labs.ooni.io
hkgjump.ooni.nu
hkgsuperset.ooni.io

[ams]
explorer.ooni.io
Expand Down
10 changes: 10 additions & 0 deletions ansible/inventory-check.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,16 @@
---
- import_playbook: ansible-version.yml

# Inventory sanity check: every host must be reachable through the dom0
# group hierarchy, otherwise it is silently excluded from dom0-rooted plays.
- hosts: all
  gather_facts: false  # pure inventory check — no need to contact the hosts
  tasks:
    - name: ensure that all inventory hosts are rooted to dom0
      assert:
        that:
          - groups.all | difference(groups.dom0) | length == 0
        msg: "Hosts in inventory not rooted to dom0: {{ groups.all | difference(groups.dom0) | sort | join(' ') }}"
      run_once: true  # the assertion is inventory-global, one evaluation suffices

- hosts: all
vars:
ansible_become: false # root is not required here, also it's not `become: false` as variable declared for `all` has precedence over directive :-/
Expand Down
32 changes: 9 additions & 23 deletions ansible/roles/airflow/files/nodeexp_airflow
Original file line number Diff line number Diff line change
Expand Up @@ -6,37 +6,23 @@ export PGHOSTADDR=`docker inspect af-psql | jq -r '.[].NetworkSettings.Networks.

dest=`mktemp -t textfile.XXXXXXXXXX`

# It previously was airflow_7day_task_count{state=*} (the total number of
# TaskInstances within last 7 days in specific state). It turned out that only
# `failed` state matters.
cat >"$dest" <<EOF
# HELP airflow_7day_task_count The total number of TaskInstances within last 7 days in specific state.
# TYPE airflow_7day_task_count gauge
# HELP airflow_7d_failed_count The total number of failed TaskInstances within last 7 days.
# TYPE airflow_7d_failed_count gauge
EOF

/sbin/start-stop-daemon --chuid daemon:daemon \
--exec /usr/bin/psql --start \
-- \
--no-align --tuples-only airflow >>"$dest" <<EOF
WITH known_state (state) AS ( VALUES
('None'), -- NB: JOIN on NULL is not possible
('removed'),
('scheduled'),
('queued'),
('running'),
('success'),
('shutdown'),
('failed'),
('up_for_retry'),
('upstream_failed'),
('skipped'))
SELECT
'airflow_7day_task_count{state="' || state || '"} ' || COALESCE(cnt, 0)
FROM
known_state
LEFT JOIN (
SELECT COALESCE(state, 'None') as state, COUNT(*) AS cnt
FROM task_instance
WHERE execution_date >= NOW() - INTERVAL '7 DAYS'
GROUP BY state
) t1 USING (state)
'airflow_7d_failed_count{task="' || task_id || '",dag="' || dag_id || '"} ' || SUM(CASE WHEN state = 'failed' THEN 1 ELSE 0 END)
FROM task_instance
WHERE execution_date >= NOW() - INTERVAL '7 DAYS'
GROUP BY task_id, dag_id
EOF

cp "$dest" /run/nodeexp/airflow.prom
Expand Down
1 change: 0 additions & 1 deletion ansible/roles/airflow/tasks/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@

# Install the hourly cron job that writes airflow metrics for node_exporter.
# NOTE: the original line repeated `mode=0555` twice; duplicate removed.
- name: copy nodeexp_airflow check
  copy: src=nodeexp_airflow dest=/etc/cron.hourly/nodeexp_airflow mode=0555 owner=root group=root
  notify: restart af-worker # because the _file_ is bind-mounted, not a directory

- name: shared airflow config templates
file: dest=/srv/etc/af-share-{{ airflow_tag }} state=directory owner=root group=root mode=0750
Expand Down
8 changes: 8 additions & 0 deletions ansible/roles/blackbox_exporter/tasks/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,13 @@
group: root
remote_src: true

# TODO: AmbientCapabilities= needs systemd >= 229, but we currently have
# systemd 215, so grant cap_net_raw as a file capability on the binary
# instead (raw sockets are needed for the icmp prober added in blackbox.yml.j2).
- name: Set cap_net_raw for blackbox_exporter
  capabilities:
    path: '{{ blackbox_exporter_path }}/blackbox_exporter'
    capability: cap_net_raw+ep
    state: present

- name: Install config file
template:
src: blackbox.yml.j2
Expand All @@ -41,6 +48,7 @@

- name: Install blackbox_exporter systemd service file
notify:
- systemctl daemon-reload
- restart blackbox_exporter
template:
src: blackbox_exporter.service
Expand Down
3 changes: 3 additions & 0 deletions ansible/roles/blackbox_exporter/templates/blackbox.yml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ modules:
prober: tcp
timeout: 5s
tcp:
preferred_ip_protocol: ip4
query_response:
- expect: "^SSH-2.0-"
send: "SSH-2.0-blackbox_exporter OONI-prometheus-0.0\x0d" # WTF: \x0a is auto-added https://github.com/prometheus/blackbox_exporter/blob/master/tcp.go#L127
Expand All @@ -43,6 +44,8 @@ modules:
icmp:
prober: icmp
timeout: 5s
icmp:
preferred_ip_protocol: ip4

tls_snakeoil:
prober: tcp
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@ Description=Prometheus Blackbox Exporter
User={{ passwd.prombbox.login }}
Group={{ passwd.prombbox.login }}
EnvironmentFile=-/etc/default/blackbox_exporter
AmbientCapabilities=CAP_NET_RAW
# TODO: AmbientCapabilities= needs systemd >= 229, but we currently have systemd 215
# AmbientCapabilities=CAP_NET_RAW
CapabilityBoundingSet=CAP_NET_RAW
ExecStart={{ blackbox_exporter_path }}/blackbox_exporter \
--config.file={{ blackbox_exporter_base }}/blackbox.yml \
Expand Down
3 changes: 3 additions & 0 deletions ansible/roles/node_exporter/defaults/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@ node_exporter_collectors: >
--no-collector.arp --no-collector.bcache --no-collector.infiniband
--no-collector.ipvs --no-collector.wifi --no-collector.zfs
--collector.ntp --collector.ntp.local-offset-tolerance=5ms
{% if ansible_service_mgr == 'systemd' %}
--collector.systemd --collector.systemd.unit-whitelist=^$
{% endif %}
node_exporter_disk_ignored: ""

Expand Down
4 changes: 3 additions & 1 deletion ansible/roles/node_exporter/files/seeksample.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,9 @@ def exporter(dataq, fpath):
'# HELP node_seeksample_timestamp Unixtime of last successful seek() sampling.\n'
'# TYPE node_seeksample_timestamp gauge\n'
]+ ['node_seeksample_timestamp{device="%s"} %f\n' % (d, stats[d]['timestamp']) for d in stats])
with open(fpath, 'r+') as fd: # `r+` instead of `w` should prevent memory allocation that may be hard during OOM
# `r+` instead of `w` should prevent memory allocation that may be hard during OOM
mode = 'r+' if os.path.exists(fpath) else 'w'
with open(fpath, mode) as fd:
size = os.path.getsize(fpath)
if size > len(prom):
prom += '#' * (size - len(prom) - 1) # pseudo-atomicity to have fixed-size file
Expand Down
11 changes: 11 additions & 0 deletions ansible/roles/node_exporter/tasks/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,17 @@
owner: root
group: root
remote_src: true # file is ALREADY on the remote system. sigh.
creates: '{{ node_exporter_base }}/{{ node_exporter_basename }}/node_exporter'

# For some unknown reason some nodes do not have `dbus`, but the systemd
# collector enabled in defaults/main.yml depends on it :-/
- name: install dbus to punch hole to systemd
  apt:
    name: dbus
    state: present
    update_cache: true       # canonical boolean instead of YAML-1.1 `yes`
    cache_valid_time: 28800  # refresh apt cache at most every 8 hours
    install_recommends: false
  when: ansible_service_mgr == 'systemd'

- name: Install node_exporter systemd service file
notify:
Expand Down
6 changes: 4 additions & 2 deletions ansible/roles/prometheus/files/alert_ooni.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ groups:
annotations:
summary: '`rsync` exitcode: {{ $value }}'

# Some nodes may have it broken, but it usually works, here is expr to validate it.
# expr: (max without (device) (node_seeksample_timestamp{job="node"}) or up{job="node"}) == 1
- alert: DiskStuck
expr: node_time - ignoring(device) group_right node_seeksample_timestamp > 240
annotations:
Expand All @@ -26,9 +28,9 @@ groups:
description: '{{ if eq $labels.instance "ooni.torproject.org:443" }}Run `make update-site`.{{ end }}'

- alert: AirflowFailed
expr: airflow_7day_task_count{state="failed"} > 0
expr: airflow_7d_failed_count > 0
annotations:
summary: '{{ $value }} tasks report failures last week'
summary: '{{ $value }} task `{{ $labels.task }}` failures last week'

- alert: SlacktopusGone
expr: time() - slacktopus_ctcp_pong_time > 660 # it's randomized */5 crontab
Expand Down
53 changes: 52 additions & 1 deletion ansible/roles/prometheus/files/alert_rules.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,56 @@ groups:
annotations:
summary: '{{ $labels.instance }} is not `up`'

- alert: systemd # yes, just "systemd", it's unclear what's going wrong :-)
expr: node_systemd_system_running != 1 # that's basically output of `systemctl is-system-running`
annotations:
summary: '{{ $labels.instance }} is not OK, check `systemctl list-units | grep failed`'

- alert: IOWaitHigh
expr: max(irate(node_cpu{mode="iowait"}[1m])) by (instance) > 0.9
expr: sum without (cpu) (irate(node_cpu{mode="iowait"}[1m])) > 0.9
for: 5m # matters to avoid spikes
annotations:
summary: '{{ $labels.instance }} %iowait over 90%'

# the difference between node_disk_{io,read,write}_time_ms is not clear, `io` is NOT `read + write`, it may be greater, it may be less...
# All the nodes have `node_disk_io_time_ms`, but it can be verified with expr: (sum without(device) (node_disk_io_time_ms{job="node"}) or up{job="node"}) == 1
- alert: IOHigh
expr: irate(node_disk_io_time_ms{device!~"(nbd[0-9]+|dm-[0-9]+|ram[0-9]+|sr[0-9]+|md[0-9]+)"}[1m]) > 800
for: 2h
annotations:
summary: '{{ $labels.instance }}/{{ $labels.device }} spends {{ $value }}ms/s in IO over 2 hours'

- alert: CPUHigh
expr: sum without (mode, cpu) (irate(node_cpu{mode!="idle"}[1m])) > 0.75
for: 8h
annotations:
summary: '{{ $labels.instance }} has {{ printf "%.2f" $value }} CPU used over 8 hours'

- alert: NetworkRXHigh
expr: irate(node_network_receive_bytes{device!~"(docker0|veth[0-9a-f]{7}|lo|br[-a-z].*|dummy0)"}[1m]) * 8 > 50 * 1024 * 1024 # OONITestHelper has BandwidthRate 20MBits
for: 1h
annotations:
summary: '{{ $labels.instance }}/{{ $labels.device }} gets {{ $value | humanize }}bit/s'

- alert: NetworkTXHigh
expr: irate(node_network_transmit_bytes{device!~"(docker0|veth[0-9a-f]{7}|lo|br[-a-z].*|dummy0)"}[1m]) * 8 > 50 * 1024 * 1024 # OONITestHelper has BandwidthRate 20MBits
for: 1h
annotations:
summary: '{{ $labels.instance }}/{{ $labels.device }} sends {{ $value | humanize }}bit/s'

- alert: LoadAverageHigh
expr: node_load15 > 8 # largest node available has NCPU=4
for: 1h
annotations:
summary: '{{ $labels.instance }} has LA15 {{ printf "%.1f" $value }}'

# chameleon.infra.ooni.io and b.echo.th.ooni.io do not have MemAvailable signal that is an _estimate_ of RAM available for userspace allocations provided by kernel.
# These old nodes can be listed with expr: (node_memory_MemAvailable or (up - 1)) / node_memory_MemTotal == 0
- alert: MemoryLow
expr: ((node_memory_MemAvailable) / node_memory_MemTotal or ((node_memory_MemFree + node_memory_Buffers + node_memory_Cached) / node_memory_MemTotal)) * 100 < 20
annotations:
summary: '{{ $labels.instance }} has {{ printf "%.0f" $value }}% RAM available'

- alert: NodeSwapping
expr: rate(node_vmstat_pgmajfault[1m]) > 100 # and instance:node_memory_available:ratio < 0.05
for: 15m # number stolen from https://dev.gitlab.org/cookbooks/runbooks/blob/master/rules/node.yml
Expand Down Expand Up @@ -95,4 +139,11 @@ groups:
annotations:
summary: '`certbot` failed at {{ $labels.instance }}'
description: 'SSL cert expires in {{ printf "%.0f" $value }} days.'

# Meta-monitoring: alert when alerting itself is broken.
- name: Prometheus self-care
  rules:
    # Any non-zero rate of failed notification deliveries is worth a look.
    - alert: AlertmanagerNotificationsFailing
      expr: rate(alertmanager_notifications_failed_total[1m]) > 0
      annotations:
        summary: 'Alertmanager {{ $labels.instance }} fails {{ $labels.integration }} notifications.'
...

0 comments on commit eaeb1c1

Please sign in to comment.