Skip to content

Commit

Permalink
Improve alerting
Browse files Browse the repository at this point in the history
- Add more alerting test domains
- alert if at least 2 domains fail
- don't test mail on domains that have no MX
- also measure if probe passed or not
- don't measure scores for probes that don't count towards the total score (appsecpriv)
- add metrics for failed probes on timeout
  • Loading branch information
aequitas committed Sep 18, 2024
1 parent 6d587d8 commit 629b5fb
Show file tree
Hide file tree
Showing 3 changed files with 429 additions and 52 deletions.
50 changes: 42 additions & 8 deletions docker/cron/periodic/15min/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,11 +33,30 @@
URL_BASE = f"http://{IPV4_IP_APP_INTERNAL}:8080"
HEADERS = {"Host": INTERNETNL_DOMAINNAME}

# domain's to use in website tests
WEBSITE_TEST_DOMAINS = [
"example.nl",
"example.com",
]
TEST_DOMAINS = {
    # domains to use in website tests
"site": [
"internet.nl",
"example.nl",
"example.com",
"internetsociety.org",
"ripe.net",
"surf.nl",
"ecp.nl",
"forumstandaardisatie.nl",
"minez.nl",
],
    # domains to use in mail tests
"mail": [
"internet.nl",
"internetsociety.org",
"ripe.net",
"surf.nl",
"ecp.nl",
"forumstandaardisatie.nl",
"minez.nl",
],
}


METRIC_PROBE_DONE = Gauge("tests_probe_done_total", "Whether the probe completed.", ["test", "domain", "probe"])
Expand All @@ -46,12 +65,13 @@
"tests_probe_runtime_seconds", "Amount of time probe ran before done.", ["test", "domain", "probe"]
)
METRIC_PROBE_SCORE = Gauge("tests_probe_score", "Score of the probe.", ["test", "domain", "probe"])
METRIC_PROBE_PASSED = Gauge("tests_probe_pass", "Probe has passed.", ["test", "domain", "probe"])

METRIC_TEST_RUN = Gauge("tests_test_run_total", "Test that have been run.", ["test", "domain"])
METRIC_TEST_CACHE = Gauge("tests_test_cached_total", "Test runs that returned cached results.", ["test", "domain"])
METRIC_TEST_FAILURE = Gauge("tests_test_failure_total", "Test runs that failed.", ["test", "domain"])
METRIC_TEST_SUCCESS = Gauge("tests_test_success_total", "Test runs that succeeded.", ["test", "domain"])
METRIC_TEST_TIMEOUT = Gauge("tests_test_timeout", "Test that ran into timeout.", ["test", "domain"])
METRIC_TEST_TIMEOUT = Gauge("tests_test_timeout_total", "Test that ran into timeout.", ["test", "domain"])
METRIC_TEST_RUNTIME = Gauge("tests_test_runtime_seconds", "Amount of time test ran before done.", ["test", "domain"])


Expand Down Expand Up @@ -99,11 +119,20 @@ def run_tests_on_domain(test, domain):

# stop when all probes are finished
if not [p for p in probes if not p["done"]]:
METRIC_TEST_SUCCESS.labels(test, domain).set(1)
break

time.sleep(1)
else:
# test timed out because one or more of the probes was not done within time
METRIC_TEST_TIMEOUT.labels(test, domain).set(1)
for probe in probes:
if probe["name"] in finished_probes:
continue
# record not finished probes as failed
METRIC_PROBE_DONE.labels(test, domain, probe["name"]).set(probe["done"])
METRIC_PROBE_SUCCESS.labels(test, domain, probe["name"]).set(probe["success"])
METRIC_PROBE_RUNTIME.labels(test, domain, probe["name"]).set(int(time.time() - test_start))

METRIC_TEST_RUNTIME.labels(test, domain).set(int(time.time() - test_start))

Expand All @@ -119,18 +148,23 @@ def run_tests_on_domain(test, domain):
r.raise_for_status()
if r.status_code == 200:
probe_result = r.json()
METRIC_PROBE_SCORE.labels(test, domain, probe_name).set(probe_result["totalscore"])
# only measure probe scores that count towards total score
if probe_result["maxscore"]:
METRIC_PROBE_SCORE.labels(test, domain, probe_name).set(probe_result["totalscore"])
METRIC_PROBE_PASSED.labels(test, domain, probe_name).set(probe_result["verdict"] == "passed")
except Exception:
log.exception("failed to get probe score")


def run_tests():
for test in TESTS:
for domain in WEBSITE_TEST_DOMAINS:
for domain in TEST_DOMAINS[test]:
log.info(f"testing: {test} {domain}")
METRIC_TEST_RUN.labels(test, domain).set(1)
METRIC_TEST_CACHE.labels(test, domain).set(0)
METRIC_TEST_FAILURE.labels(test, domain).set(0)
METRIC_TEST_TIMEOUT.labels(test, domain).set(0)
METRIC_TEST_SUCCESS.labels(test, domain).set(0)
try:
run_tests_on_domain(test, domain)
except Exception:
Expand Down
17 changes: 14 additions & 3 deletions docker/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -1072,12 +1072,23 @@ configs:
groups:
- name: End to end monitoring
rules:
- alert: HighTestRuntime
expr: min(tests_test_runtime_seconds{test="site"})>=10 and max(tests_test_runtime_seconds{test="site"})>=30
- alert: HighTestRuntimeSite
# when site probes for 2 or more of the test domains take longer than 30 seconds something is wrong
expr: count(tests_test_runtime_seconds{test="site"} >= 30) >= 2
annotations:
host: $INTERNETNL_DOMAINNAME
summary: Tests/probes take longer to complete than expected
        summary: Two or more tests for site take longer to complete than expected
dashboard: 'https://$INTERNETNL_DOMAINNAME/grafana/d/af7d1d82-c0f9-4d8d-bc03-542c4c4c75c0/periodic-tests'
- alert: HighTestRuntimeMail
# when mail probes for 2 or more of the test domains take longer than 30 seconds something is wrong
# minez.nl and forumstandaardisatie.nl currently time out on mail test tls probe, skipping for now, should be solved when switching to sslyze
        expr: count(tests_test_runtime_seconds{test="mail", domain!~"minez.nl|forumstandaardisatie.nl"} >= 30) >= 2
annotations:
host: $INTERNETNL_DOMAINNAME
summary: Two or more tests for mail take longer to complete than expected
dashboard: 'https://$INTERNETNL_DOMAINNAME/grafana/d/af7d1d82-c0f9-4d8d-bc03-542c4c4c75c0/periodic-tests'
alertmanager_config:
content: |
global:
Expand Down
Loading

0 comments on commit 629b5fb

Please sign in to comment.